[mlir][Linalg] Avoid using scf.parallel for non-parallel loops in Linalg ops.

Modify the loop nest builder that generates scf.parallel loops so that
scf.parallel ops are not generated for non-parallel iterator types in
Linalg operations. The existing implementation incorrectly generated
scf.parallel for all tiled loops. This is rectified by refactoring the
logic used while lowering to loops, which already accounted for iterator
types, into a shared utility and reusing it.

Differential Revision: https://reviews.llvm.org/D80188
MaheshRavishankar committed May 27, 2020
1 parent a714148 commit 5759e47
Showing 8 changed files with 312 additions and 78 deletions.
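The gist of the change, as a hedged sketch (a hypothetical matmul-like op; tile sizes, bounds, and value names are for illustration only, not taken from this commit): with iterator_types = ["parallel", "parallel", "reduction"], tiling with the scf.parallel loop type previously emitted

  scf.parallel (%i, %j, %k) = (%c0, %c0, %c0) to (%M, %N, %K)
                              step (%c2, %c3, %c4) { ... }

which parallelizes the reduction dimension %k and is unsafe for the accumulation. After this change, only the outer parallel dimensions use scf.parallel and the reduction dimension lowers to a sequential scf.for:

  scf.parallel (%i, %j) = (%c0, %c0) to (%M, %N) step (%c2, %c3) {
    scf.for %k = %c0 to %K step %c4 { ... }
  }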
31 changes: 31 additions & 0 deletions mlir/include/mlir/Dialect/Linalg/Utils/Utils.h
@@ -9,14 +9,21 @@
#ifndef MLIR_DIALECT_LINALG_UTILS_H_
#define MLIR_DIALECT_LINALG_UTILS_H_

#include "mlir/Dialect/Affine/EDSC/Intrinsics.h"
#include "mlir/Dialect/Linalg/EDSC/Builders.h"
#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/StandardOps/EDSC/Intrinsics.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"

#include "llvm/ADT/SetVector.h"

using mlir::edsc::intrinsics::AffineIndexedValue;
using mlir::edsc::intrinsics::StdIndexedValue;

namespace mlir {
class AffineExpr;
class AffineForOp;
class AffineMap;
class OperationFolder;
class PatternRewriter;
@@ -49,6 +56,15 @@ struct RegionMatcher {
static Optional<BinaryOpKind> matchAsScalarBinaryOp(GenericOp op);
};

/// Checks if an iterator_type attribute is parallel.
bool isParallelIteratorType(Attribute attr);

/// Checks if an iterator_type attribute is reduction.
bool isReductionIteratorType(Attribute attr);

/// Checks if an iterator_type attribute is window.
bool isWindowIteratorType(Attribute attr);

/// Checks whether the specific `producer` is the last write to exactly the
/// whole `consumedView`. This checks structural dominance, that the dependence
/// is a RAW without any interleaved write to any piece of `consumedView`.
@@ -141,6 +157,21 @@ void applyPermutationToVector(SmallVector<T, N> &inVec,
inVec = auxVec;
}

/// Utility class used to generate nested loops with ranges described by
/// `loopRanges` and loop types described by `iteratorTypes`. `allIvs` is
/// populated with induction variables for all generated loops on return, with
/// `fun` used to generate the body of the innermost loop.
template <typename LoopTy>
struct GenerateLoopNest {
using IndexedValueTy =
typename std::conditional<std::is_same<LoopTy, AffineForOp>::value,
AffineIndexedValue, StdIndexedValue>::type;
static void doit(MutableArrayRef<Value> allIvs,
ArrayRef<SubViewOp::Range> loopRanges,
ArrayRef<Attribute> iteratorTypes,
std::function<void(void)> fun);
};

} // namespace linalg
} // namespace mlir

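A sketch of what the three new predicates classify, using a hypothetical iterator_types attribute (the behavior follows the implementations added in Utils.cpp below):

  // Given: iterator_types = ["parallel", "reduction", "window"]
  //   isParallelIteratorType(attr)  -> true only for "parallel"
  //   isReductionIteratorType(attr) -> true only for "reduction"
  //   isWindowIteratorType(attr)    -> true only for "window"
  // An entry that is not a StringAttr returns false from all three.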
81 changes: 8 additions & 73 deletions mlir/lib/Dialect/Linalg/Transforms/Loops.cpp
@@ -487,80 +487,9 @@ class LinalgScopedEmitter<IndexedValueType, IndexedGenericOp> {
}
};

namespace {
/// Helper struct to generate the loop nest for the op. This is factored out
/// here to be able to partially specialize it for different LoopTy.
template <typename LoopTy, typename ConcreteOpTy>
class GenerateLoopNest {
public:
using IndexedValueTy =
typename std::conditional<std::is_same<LoopTy, AffineForOp>::value,
AffineIndexedValue, StdIndexedValue>::type;
static void doit(ConcreteOpTy linalgOp, ArrayRef<SubViewOp::Range> loopRanges,
MutableArrayRef<Value> allIvs) {
GenericLoopNestRangeBuilder<LoopTy>(allIvs, loopRanges)([&] {
SmallVector<Value, 4> allIvValues(allIvs.begin(), allIvs.end());
LinalgScopedEmitter<IndexedValueTy,
ConcreteOpTy>::emitScalarImplementation(allIvValues,
linalgOp);
});
}
};

/// Generates loop nest using scf.parallel. scf.parallel is only used for the
/// outer parallel loops. All other loops are generated using scf.for
/// operation.
template <typename ConcreteOpTy>
class GenerateLoopNest<scf::ParallelOp, ConcreteOpTy> {
public:
using IndexedValueTy = StdIndexedValue;

static void doit(ConcreteOpTy linalgOp, ArrayRef<SubViewOp::Range> loopRanges,
MutableArrayRef<Value> allIvs) {
// Only generate scf.parallel for outer consecutive "parallel"
// iterator_types.
// TODO(ravishankarm): Generate scf.parallel for all "parallel" iterator
// types, not just the outer most ones. Also handle "reduction" iterator
// types.
auto nOuterPar = linalgOp.iterator_types()
.getValue()
.take_while([](Attribute attr) {
return attr.cast<StringAttr>().getValue() ==
getParallelIteratorTypeName();
})
.size();
// If there are no outer parallel loops, then the number of loop ops is the
// same as the number of loops, and they are all scf.for ops.
if (nOuterPar) {
GenericLoopNestRangeBuilder<scf::ParallelOp>(
allIvs.take_front(nOuterPar), loopRanges.take_front(nOuterPar))([&] {
GenericLoopNestRangeBuilder<scf::ForOp>(
allIvs.drop_front(nOuterPar),
loopRanges.drop_front(nOuterPar))([&] {
SmallVector<Value, 4> allIvValues(allIvs.begin(), allIvs.end());
LinalgScopedEmitter<StdIndexedValue, ConcreteOpTy>::
emitScalarImplementation(allIvValues, linalgOp);
});
});
} else {
// If there are no parallel loops then fall back to generating all scf.for
// operations.
GenericLoopNestRangeBuilder<scf::ForOp>(allIvs, loopRanges)([&] {
SmallVector<Value, 4> allIvValues(allIvs.begin(), allIvs.end());
LinalgScopedEmitter<StdIndexedValue,
ConcreteOpTy>::emitScalarImplementation(allIvValues,
linalgOp);
});
}
}
};
} // namespace

template <typename LoopTy, typename ConcreteOpTy>
Optional<LinalgLoops> linalgOpToLoopsImpl(Operation *op, OpBuilder &builder) {
using Impl = GenerateLoopNest<LoopTy, ConcreteOpTy>;
using IndexedValueTy =
typename GenerateLoopNest<LoopTy, ConcreteOpTy>::IndexedValueTy;
using IndexedValueTy = typename GenerateLoopNest<LoopTy>::IndexedValueTy;

ScopedContext scope(builder, op->getLoc());

@@ -591,7 +520,13 @@ Optional<LinalgLoops> linalgOpToLoopsImpl(Operation *op, OpBuilder &builder) {
emitLoopRanges(scope.getBuilderRef(), scope.getLocation(), invertedMap,
getViewSizes(builder, linalgOp));
assert(loopRanges.size() == allIvs.size());
Impl::doit(linalgOp, loopRanges, allIvs);
GenerateLoopNest<LoopTy>::doit(
allIvs, loopRanges, linalgOp.iterator_types().getValue(), [&] {
SmallVector<Value, 4> allIvValues(allIvs.begin(), allIvs.end());
LinalgScopedEmitter<IndexedValueTy,
ConcreteOpTy>::emitScalarImplementation(allIvValues,
linalgOp);
});
// Number of loop ops might be different from the number of ivs since some
// loops like affine.parallel and scf.parallel have multiple ivs.
llvm::SetVector<Operation *> loopSet;
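The removed specialization above only parallelized the outermost band of "parallel" iterators and emitted scf.for for everything after it (see its TODO). The shared GenerateLoopNest<scf::ParallelOp> added in Utils.cpp recurses past sequential iterators, so inner parallel bands are recovered as well. A hedged sketch for a hypothetical 3-d op with iterator_types = ["parallel", "reduction", "parallel"]:

  // Old lowering (trailing parallel loop left sequential):
  //   scf.parallel (%i) = ... { scf.for %j = ... { scf.for %k = ... } }
  // New lowering (inner parallel band recovered):
  //   scf.parallel (%i) = ... { scf.for %j = ... { scf.parallel (%k) = ... } }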
10 changes: 7 additions & 3 deletions mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
@@ -376,16 +376,20 @@ Optional<TiledLinalgOp> static tileLinalgOpImpl(
// 3. Create the tiled loops.
LinalgOp res = op;
SmallVector<Value, 4> ivs(loopRanges.size());
GenericLoopNestRangeBuilder<LoopTy>(ivs, loopRanges)([&] {
SmallVector<Attribute, 4> iteratorTypes =
llvm::to_vector<4>(op.iterator_types().cast<ArrayAttr>().getValue());
if (!options.interchangeVector.empty())
applyPermutationToVector(iteratorTypes, options.interchangeVector);
GenerateLoopNest<LoopTy>::doit(ivs, loopRanges, iteratorTypes, [&] {
auto &b = ScopedContext::getBuilderRef();
auto loc = ScopedContext::getLocation();
SmallVector<Value, 4> ivValues(ivs.begin(), ivs.end());

// If we have to apply a permutation to the tiled loop nest, we have to
// reorder the induction variables. This permutation is the right one
// assuming that loopRanges have previously been permuted by
// (i,j,k)->(k,i,j). So this permutation should be the inversePermutation
// of that one: (d0,d1,d2)->(d2,d0,d1).
if (!options.interchangeVector.empty())
ivValues = applyMapToValues(b, loc, invPermutationMap, ivValues);

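Since iteratorTypes is now permuted alongside loopRanges, an interchange that rotates a reduction dimension outermost yields the correct loop kinds. A hedged sketch (hypothetical matmul, interchange (i,j,k)->(k,i,j), tile sizes 2, 3, 4; names illustrative only):

  // Permuted iterator_types become ["reduction", "parallel", "parallel"],
  // so the outermost tiled loop is sequential and the inner band parallel:
  scf.for %k = %c0 to %K step %c4 {
    scf.parallel (%i, %j) = (%c0, %c0) to (%M, %N) step (%c2, %c3) {
      ...
    }
  }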
89 changes: 89 additions & 0 deletions mlir/lib/Dialect/Linalg/Utils/Utils.cpp
@@ -14,6 +14,7 @@
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
#include "mlir/Dialect/Linalg/IR/LinalgTypes.h"
#include "mlir/Dialect/SCF/EDSC/Builders.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/AffineExpr.h"
@@ -101,3 +102,91 @@ mlir::linalg::getAssumedNonViewOperands(LinalgOp linalgOp) {
}
return res;
}

bool mlir::linalg::isParallelIteratorType(Attribute attr) {
if (auto strAttr = attr.dyn_cast<StringAttr>()) {
return strAttr.getValue() == getParallelIteratorTypeName();
}
return false;
}

bool mlir::linalg::isReductionIteratorType(Attribute attr) {
if (auto strAttr = attr.dyn_cast<StringAttr>()) {
return strAttr.getValue() == getReductionIteratorTypeName();
}
return false;
}

bool mlir::linalg::isWindowIteratorType(Attribute attr) {
if (auto strAttr = attr.dyn_cast<StringAttr>()) {
return strAttr.getValue() == getWindowIteratorTypeName();
}
return false;
}

/// Explicit instantiation of loop nest generator for different loop types.
template struct mlir::linalg::GenerateLoopNest<scf::ForOp>;
template struct mlir::linalg::GenerateLoopNest<scf::ParallelOp>;
template struct mlir::linalg::GenerateLoopNest<AffineForOp>;

/// Loop nest generation for scf.for and affine.for loop types: all loops in
/// the nest are generated as sequential loops, irrespective of the iterator
/// types.
template <>
void mlir::linalg::GenerateLoopNest<scf::ForOp>::doit(
MutableArrayRef<Value> allIvs, ArrayRef<SubViewOp::Range> loopRanges,
ArrayRef<Attribute> iteratorTypes, std::function<void(void)> fun) {
edsc::GenericLoopNestRangeBuilder<scf::ForOp>(allIvs, loopRanges)(fun);
}

template <>
void mlir::linalg::GenerateLoopNest<AffineForOp>::doit(
MutableArrayRef<Value> allIvs, ArrayRef<SubViewOp::Range> loopRanges,
ArrayRef<Attribute> iteratorTypes, std::function<void(void)> fun) {
edsc::GenericLoopNestRangeBuilder<AffineForOp>(allIvs, loopRanges)(fun);
}

/// Specialization of loop nest generator for scf.parallel loops to handle
/// iterator types that are not parallel. These are generated as sequential
/// loops.
template <>
void mlir::linalg::GenerateLoopNest<scf::ParallelOp>::doit(
MutableArrayRef<Value> allIvs, ArrayRef<SubViewOp::Range> loopRanges,
ArrayRef<Attribute> iteratorTypes, std::function<void(void)> fun) {
// Check if there is nothing to do here. This is also the recursion
// termination.
if (loopRanges.empty())
return;
size_t nOuterPar = iteratorTypes.take_front(loopRanges.size())
.take_while(isParallelIteratorType)
.size();
if (nOuterPar == 0 && loopRanges.size() == 1)
// Generate the sequential for loop for the remaining non-parallel loop.
return GenerateLoopNest<scf::ForOp>::doit(allIvs, loopRanges, iteratorTypes,
fun);
if (nOuterPar == 0) {
// The immediate outer loop is not parallel. Generate a scf.for op for this
// loop, but there might be subsequent loops that are parallel. Use
// recursion to find those.
auto nestedFn = [&]() {
GenerateLoopNest<scf::ParallelOp>::doit(allIvs.drop_front(),
loopRanges.drop_front(),
iteratorTypes.drop_front(), fun);
};
return GenerateLoopNest<scf::ForOp>::doit(allIvs[0], loopRanges[0],
iteratorTypes[0], nestedFn);
}
if (nOuterPar == loopRanges.size()) {
// All loops are parallel, so generate the scf.parallel op.
return edsc::GenericLoopNestRangeBuilder<scf::ParallelOp>(allIvs,
loopRanges)(fun);
}
// Generate scf.parallel for the outer parallel loops. The next inner loop is
// sequential, but there might be more parallel loops after that. So recurse
// into the same method.
auto nestedFn = [&]() {
GenerateLoopNest<scf::ParallelOp>::doit(
allIvs.drop_front(nOuterPar), loopRanges.drop_front(nOuterPar),
iteratorTypes.drop_front(nOuterPar), fun);
};
return GenerateLoopNest<scf::ParallelOp>::doit(
allIvs.take_front(nOuterPar), loopRanges.take_front(nOuterPar),
iteratorTypes.take_front(nOuterPar), nestedFn);
}
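A trace of this recursion for the mixed case exercised by the new test below, with iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]:

  // nOuterPar = 2 -> scf.parallel (%iv0, %iv1), recurse on remaining 4 loops
  //   nOuterPar = 0 -> scf.for %iv2, recurse on remaining 3 loops
  //     nOuterPar = 2 -> scf.parallel (%iv3, %iv4), recurse on remaining loop
  //       nOuterPar = 0, single loop left -> scf.for %iv5, emit the body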
38 changes: 37 additions & 1 deletion mlir/test/Dialect/Linalg/parallel_loops.mlir
@@ -57,6 +57,42 @@ func @lower_outer_parallel(%A: memref<?x?x?x?xf32>, %B: memref<?x?x?xf32>) {
// CHECK-DAG: %[[D3:.*]] = dim %{{.*}}, 3
// CHECK: scf.parallel (%[[IV0:.*]], %[[IV1:.*]]) = (%[[C0]], %[[C0]]) to (%[[D0]], %[[D1]]) step (%[[C1]], %[[C1]])
// CHECK: scf.for %[[IV2:.*]] = %[[C0]] to %[[D2]] step %[[C1]]
// CHECK: scf.for %[[IV3:.*]] = %[[C0]] to %[[D3]] step %[[C1]]
// CHECK: scf.parallel (%[[IV3:.*]]) = (%[[C0]]) to (%[[D3]]) step (%[[C1]])
// CHECK: load %{{.*}}[%[[IV0]], %[[IV1]], %[[IV2]], %[[IV3]]]
// CHECK: store %{{.*}}, %{{.*}}[%[[IV0]], %[[IV1]], %[[IV3]]]

// -----

#accesses = [
affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>,
affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d4, d5)>
]
#trait = {
args_in = 1,
args_out = 1,
iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"],
indexing_maps = #accesses
}

func @lower_mixed_parallel(%A: memref<?x?x?x?x?x?xf32>, %B: memref<?x?x?x?xf32>) {
linalg.generic #trait %A, %B {
^bb0(%a: f32, %b: f32):
linalg.yield %a: f32
} : memref<?x?x?x?x?x?xf32>, memref<?x?x?x?xf32>
return
}
// CHECK-LABEL: @lower_mixed_parallel
// CHECK-DAG: %[[C0:.*]] = constant 0
// CHECK-DAG: %[[C1:.*]] = constant 1
// CHECK-DAG: %[[D0:.*]] = dim %{{.*}}, 0
// CHECK-DAG: %[[D1:.*]] = dim %{{.*}}, 1
// CHECK-DAG: %[[D2:.*]] = dim %{{.*}}, 2
// CHECK-DAG: %[[D3:.*]] = dim %{{.*}}, 3
// CHECK-DAG: %[[D4:.*]] = dim %{{.*}}, 4
// CHECK-DAG: %[[D5:.*]] = dim %{{.*}}, 5
// CHECK: scf.parallel (%[[IV0:.*]], %[[IV1:.*]]) = (%[[C0]], %[[C0]]) to (%[[D0]], %[[D1]]) step (%[[C1]], %[[C1]])
// CHECK: scf.for %[[IV2:.*]] = %[[C0]] to %[[D2]] step %[[C1]]
// CHECK: scf.parallel (%[[IV3:.*]], %[[IV4:.*]]) = (%[[C0]], %[[C0]]) to (%[[D3]], %[[D4]]) step (%[[C1]], %[[C1]])
// CHECK: scf.for %[[IV5:.*]] = %[[C0]] to %[[D5]] step %[[C1]]
// CHECK: load %{{.*}}[%[[IV0]], %[[IV1]], %[[IV2]], %[[IV3]], %[[IV4]], %[[IV5]]]
// CHECK: store %{{.*}}, %{{.*}}[%[[IV0]], %[[IV2]], %[[IV4]], %[[IV5]]]