A new algorithm for identification of a SCoP statement that implements…

… a matrix multiplication. The current identification of a SCoP statement that implements a matrix multiplication does not help to identify different permutations of loops that contain it and check for dependencies, which can prevent it from being optimized. It also requires external determination of the operands of the matrix multiplication. This patch contains the implementation of a new algorithm that helps to avoid these issues. It also modifies the test cases that generate matrix multiplications with linearized accesses, because the new algorithm does not support them. Reviewed-by: Michael Kruse <llvm@meinersbur.de>, Tobias Grosser <tobias@grosser.es> Differential Revision: https://reviews.llvm.org/D28357 llvm-svn: 293890
llvm · Feb 2, 2017 · 98075fe · 98075fe
1 parent 32cd9b4
commit 98075fe
Show file tree

Hide file tree

Showing 5 changed files with 624 additions and 491 deletions.
diff --git a/polly/include/polly/ScheduleOptimizer.h b/polly/include/polly/ScheduleOptimizer.h
@@ -12,6 +12,7 @@
 #ifndef POLLY_SCHEDULE_OPTIMIZER_H
 #define POLLY_SCHEDULE_OPTIMIZER_H
 
+#include "polly/DependenceInfo.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "isl/ctx.h"
@@ -42,6 +43,31 @@ struct MacroKernelParamsTy {
 };
 
 namespace polly {
+/// Additional parameters of the schedule optimizer.
+///
+/// Target Transform Info and the SCoP dependencies used by the schedule
+/// optimizer.
+///
+struct OptimizerAdditionalInfoTy {
+  const llvm::TargetTransformInfo *TTI;
+  const Dependences *D;
+};
+
+/// Parameters of the matrix multiplication operands.
+///
+/// Parameters, which describe access relations that represent operands of the
+/// matrix multiplication.
+///
+struct MatMulInfoTy {
+  MemoryAccess *A = nullptr;
+  MemoryAccess *B = nullptr;
+  MemoryAccess *ReadFromC = nullptr;
+  MemoryAccess *WriteToC = nullptr;
+  int i = -1;
+  int j = -1;
+  int k = -1;
+};
+
 extern bool DisablePollyTiling;
 class Scop;
 } // namespace polly
@@ -59,11 +85,11 @@ class ScheduleTreeOptimizer {
   ///
   /// @param Schedule The schedule object the transformations will be applied
   ///                 to.
-  /// @param TTI      Target Transform Info.
+  /// @param OAI      Target Transform Info and the SCoP dependencies.
   /// @returns        The transformed schedule.
   static __isl_give isl_schedule *
   optimizeSchedule(__isl_take isl_schedule *Schedule,
-                   const llvm::TargetTransformInfo *TTI = nullptr);
+                   const polly::OptimizerAdditionalInfoTy *OAI = nullptr);
 
   /// Apply schedule tree transformations.
   ///
@@ -75,11 +101,11 @@ class ScheduleTreeOptimizer {
   ///   - Prevectorization
   ///
   /// @param Node The schedule object post-transformations will be applied to.
-  /// @param TTI  Target Transform Info.
+  /// @param OAI  Target Transform Info and the SCoP dependencies.
   /// @returns    The transformed schedule.
   static __isl_give isl_schedule_node *
   optimizeScheduleNode(__isl_take isl_schedule_node *Node,
-                       const llvm::TargetTransformInfo *TTI = nullptr);
+                       const polly::OptimizerAdditionalInfoTy *OAI = nullptr);
 
   /// Decide if the @p NewSchedule is profitable for @p S.
   ///
@@ -128,10 +154,11 @@ class ScheduleTreeOptimizer {
 
   /// Apply the BLIS matmul optimization pattern.
   ///
-  /// Apply the BLIS matmul optimization pattern. BLIS implements gemm as three
-  /// nested loops around a macro-kernel, plus two packing routines.
-  /// The macro-kernel is implemented in terms of two additional loops around
-  /// a micro-kernel. The micro-kernel is a loop around a rank-1
+  /// Make the loops containing the matrix multiplication be the innermost
+  /// loops and apply the BLIS matmul optimization pattern. BLIS implements
+  /// gemm as three nested loops around a macro-kernel, plus two packing
+  /// routines. The macro-kernel is implemented in terms of two additional
+  /// loops around a micro-kernel. The micro-kernel is a loop around a rank-1
   /// (i.e., outer product) update.
   ///
   /// For a detailed description please see [1].
@@ -167,9 +194,13 @@ class ScheduleTreeOptimizer {
   /// @param Node The node that contains a band to be optimized. The node
   ///             is required to successfully pass
   ///             ScheduleTreeOptimizer::isMatrMultPattern.
+  /// @param TTI  Target Transform Info.
+  /// @param MMI  Parameters of the matrix multiplication operands.
+  /// @returns    The transformed schedule.
   static __isl_give isl_schedule_node *
   optimizeMatMulPattern(__isl_take isl_schedule_node *Node,
-                        const llvm::TargetTransformInfo *TTI);
+                        const llvm::TargetTransformInfo *TTI,
+                        polly::MatMulInfoTy &MMI);
 
   /// Check if this node is a band node we want to tile.
   ///
@@ -266,7 +297,11 @@ class ScheduleTreeOptimizer {
   /// the one used to get close-to-peak performance of matrix multiplications.
   ///
   /// @param Node The node to check.
-  static bool isMatrMultPattern(__isl_keep isl_schedule_node *Node);
+  /// @param D    The SCoP dependencies.
+  /// @param MMI  Parameters of the matrix multiplication operands.
+  static bool isMatrMultPattern(__isl_keep isl_schedule_node *Node,
+                                const polly::Dependences *D,
+                                polly::MatMulInfoTy &MMI);
 
   /// Create the BLIS macro-kernel.
   ///