[mlir][Inliner] Use llvm::parallelForEach instead of llvm::parallelTransformReduce

llvm::parallelTransformReduce does not schedule work on the caller thread, which becomes very costly for the inliner, where a majority of SCCs are small, often ~1 element. The switch to llvm::parallelForEach solves this, and also aligns the implementation with the PassManager (which realistically should share the same implementation). This change dropped compile time on an internal benchmark by ~1 second (25%). Differential Revision: https://reviews.llvm.org/D96086
llvm · Feb 23, 2021 · abd3c6f · abd3c6f
1 parent 65a3197
commit abd3c6f
Showing 1 changed file with 17 additions and 12 deletions.
diff --git a/mlir/lib/Transforms/Inliner.cpp b/mlir/lib/Transforms/Inliner.cpp
@@ -688,8 +688,10 @@ InlinerPass::optimizeSCCAsync(MutableArrayRef<CallGraphNode *> nodesToVisit,
                               MLIRContext *context) {
   // Ensure that there are enough pipeline maps for the optimizer to run in
   // parallel.
-  size_t numThreads = llvm::hardware_concurrency().compute_thread_count();
-  if (opPipelines.size() != numThreads) {
+  size_t numThreads =
+      std::min((size_t)llvm::hardware_concurrency().compute_thread_count(),
+               nodesToVisit.size());
+  if (opPipelines.size() < numThreads) {
     // Reserve before resizing so that we can use a reference to the first
     // element.
     opPipelines.reserve(numThreads);
@@ -706,14 +708,11 @@ InlinerPass::optimizeSCCAsync(MutableArrayRef<CallGraphNode *> nodesToVisit,
 
   // Optimize the nodes of the SCC in parallel.
   ParallelDiagnosticHandler optimizerHandler(context);
-  return llvm::parallelTransformReduce(
-      llvm::seq<size_t>(0, numThreads), success(),
-      [](LogicalResult lhs, LogicalResult rhs) {
-        return success(succeeded(lhs) && succeeded(rhs));
-      },
-      [&](size_t index) {
-        LogicalResult result = success();
-        for (auto e = nodesToVisit.size(); nodeIt < e && succeeded(result);) {
+  std::atomic<bool> passFailed(false);
+  llvm::parallelForEach(
+      opPipelines.begin(), std::next(opPipelines.begin(), numThreads),
+      [&](llvm::StringMap<OpPassManager> &pipelines) {
+        for (auto e = nodesToVisit.size(); !passFailed && nodeIt < e;) {
           // Get the next available operation index.
           unsigned nextID = nodeIt++;
           if (nextID >= e)
@@ -722,11 +721,17 @@ InlinerPass::optimizeSCCAsync(MutableArrayRef<CallGraphNode *> nodesToVisit,
           // Set the order for this thread so that diagnostics will be
           // properly ordered, and reset after optimization has finished.
           optimizerHandler.setOrderIDForThread(nextID);
-          result = optimizeCallable(nodesToVisit[nextID], opPipelines[index]);
+          LogicalResult pipelineResult =
+              optimizeCallable(nodesToVisit[nextID], pipelines);
           optimizerHandler.eraseOrderIDForThread();
+
+          if (failed(pipelineResult)) {
+            passFailed = true;
+            break;
+          }
         }
-        return result;
       });
+  return failure(passFailed);
 }
 
 LogicalResult