Skip to content

Commit

Permalink
The order of the loops defines the data reused in the BLIS implementa…
Browse files Browse the repository at this point in the history
…tion of

gemm ([1]). In particular, elements of the matrix B, the second operand of
matrix multiplication, are reused between iterations of the innermost loop.
To keep the reused data in cache, only elements of matrix A, the first operand
of matrix multiplication, should be evicted during an iteration of the
innermost loop. To provide such a cache replacement policy, elements of the
matrix A can, in particular, be loaded first and, consequently, be
least-recently-used.

In our case matrices are stored in row-major order instead of column-major
order used in the BLIS implementation ([1]). One of the ways to address it is
to accordingly change the order of the loops of the loop nest. However, this
causes elements of the matrix A to be reused in the innermost loop and,
consequently, requires loading elements of the matrix B first. Since the LLVM
vectorizer always generates loads from the matrix A before loads from the
matrix B, we cannot provide such a policy. Consequently, we only change the BLIS micro
kernel and the computation of its parameters instead. In particular, reused
elements of the matrix B are successively multiplied by specific elements of
the matrix A.

Refs.:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: https://reviews.llvm.org/D25653

llvm-svn: 289806
  • Loading branch information
gareevroman committed Dec 15, 2016
1 parent 552c8e9 commit 8babe1a
Show file tree
Hide file tree
Showing 4 changed files with 184 additions and 160 deletions.
25 changes: 19 additions & 6 deletions polly/include/polly/ScheduleOptimizer.h
Expand Up @@ -134,16 +134,29 @@ class ScheduleTreeOptimizer {
/// a micro-kernel. The micro-kernel is a loop around a rank-1
/// (i.e., outer product) update.
///
/// For a detailed description please see:
/// Analytical Modeling is Enough for High Performance BLIS
/// For a detailed description please see [1].
///
/// The order of the loops defines the data reused in the BLIS implementation
/// of gemm ([1]). In particular, elements of the matrix B, the second
/// operand of matrix multiplication, are reused between iterations of the
/// innermost loop. To keep the reused data in cache, only elements of matrix
/// A, the first operand of matrix multiplication, should be evicted during
/// an iteration of the innermost loop. To provide such a cache replacement
/// policy, elements of the matrix A can, in particular, be loaded first and,
/// consequently, be least-recently-used.
///
/// In our case matrices are stored in row-major order instead of
/// column-major order used in the BLIS implementation ([1]). This affects
/// only the form of the BLIS micro kernel and the computation of its
/// parameters. In particular, reused elements of the matrix B are
/// successively multiplied by specific elements of the matrix A.
///
/// Refs.:
/// [1] - Analytical Modeling is Enough for High Performance BLIS
/// Tze Meng Low, Francisco D Igual, Tyler M Smith, Enrique S Quintana-Orti
/// Technical Report, 2014
/// http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
///
/// In our case matrices are stored in row-major order, which is taken into
/// account during the creation of the BLIS kernels and the computation
/// of their parameters.
///
/// @see ScheduleTreeOptimizer::createMicroKernel
/// @see ScheduleTreeOptimizer::createMacroKernel
/// @see getMicroKernelParams
Expand Down
60 changes: 31 additions & 29 deletions polly/lib/Transform/ScheduleOptimizer.cpp
Expand Up @@ -538,8 +538,10 @@ permuteBandNodeDimensions(__isl_take isl_schedule_node *Node, unsigned FirstDim,

__isl_give isl_schedule_node *ScheduleTreeOptimizer::createMicroKernel(
__isl_take isl_schedule_node *Node, MicroKernelParamsTy MicroKernelParams) {
return applyRegisterTiling(Node, {MicroKernelParams.Mr, MicroKernelParams.Nr},
1);
applyRegisterTiling(Node, {MicroKernelParams.Mr, MicroKernelParams.Nr}, 1);
Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
Node = permuteBandNodeDimensions(Node, 0, 1);
return isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
}

__isl_give isl_schedule_node *ScheduleTreeOptimizer::createMacroKernel(
Expand All @@ -553,6 +555,7 @@ __isl_give isl_schedule_node *ScheduleTreeOptimizer::createMacroKernel(
{MacroKernelParams.Mc, MacroKernelParams.Nc, MacroKernelParams.Kc}, 1);
Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
Node = permuteBandNodeDimensions(Node, 1, 2);
Node = permuteBandNodeDimensions(Node, 0, 2);
return isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
}

Expand Down Expand Up @@ -609,18 +612,15 @@ getMacroKernelParams(const MicroKernelParamsTy &MicroKernelParams) {
return {1, 1, 1};
int Cbr = floor(
(CacheLevelAssociativity[0] - 1) /
(1 + static_cast<double>(MicroKernelParams.Mr) / MicroKernelParams.Nr));
(1 + static_cast<double>(MicroKernelParams.Nr) / MicroKernelParams.Mr));
int Kc = (Cbr * CacheLevelSizes[0]) /
(MicroKernelParams.Nr * CacheLevelAssociativity[0] * 8);
double Cac = static_cast<double>(MicroKernelParams.Mr * Kc * 8 *
CacheLevelAssociativity[1]) /
(MicroKernelParams.Mr * CacheLevelAssociativity[0] * 8);
double Cac = static_cast<double>(Kc * 8 * CacheLevelAssociativity[1]) /
CacheLevelSizes[1];
double Cbc = static_cast<double>(MicroKernelParams.Nr * Kc * 8 *
CacheLevelAssociativity[1]) /
double Cbc = static_cast<double>(Kc * 8 * CacheLevelAssociativity[1]) /
CacheLevelSizes[1];
int Mc = floor(MicroKernelParams.Mr / Cac);
int Nc =
floor((MicroKernelParams.Nr * (CacheLevelAssociativity[1] - 2)) / Cbc);
int Mc = floor((CacheLevelAssociativity[1] - 2) / Cac);
int Nc = floor(1 / Cbc);
return {Mc, Nc, Kc};
}

Expand Down Expand Up @@ -867,36 +867,38 @@ static __isl_give isl_schedule_node *optimizeDataLayoutMatrMulPattern(
Node = isl_schedule_node_parent(Node);
Node = isl_schedule_node_child(isl_schedule_node_band_split(Node, 2), 0);
auto *AccRel =
getMatMulAccRel(isl_map_copy(MapOldIndVar), MacroParams.Kc, 3, 6);
unsigned FirstDimSize = MacroParams.Mc * MacroParams.Kc / MicroParams.Mr;
unsigned SecondDimSize = MicroParams.Mr;
getMatMulAccRel(isl_map_copy(MapOldIndVar), MacroParams.Kc, 3, 7);
unsigned FirstDimSize = MacroParams.Nc * MacroParams.Kc / MicroParams.Nr;
unsigned SecondDimSize = MicroParams.Nr;
auto *SAI = Stmt->getParent()->createScopArrayInfo(
MemAccessA->getElementType(), "Packed_A", {FirstDimSize, SecondDimSize});
MemAccessB->getElementType(), "Packed_B", {FirstDimSize, SecondDimSize});
AccRel = isl_map_set_tuple_id(AccRel, isl_dim_out, SAI->getBasePtrId());
auto *OldAcc = MemAccessA->getAccessRelation();
MemAccessA->setNewAccessRelation(AccRel);
auto *OldAcc = MemAccessB->getAccessRelation();
MemAccessB->setNewAccessRelation(AccRel);
auto *ExtMap =
getMatMulExt(Stmt->getIslCtx(), MacroParams.Mc, 0, MacroParams.Kc);
ExtMap = isl_map_project_out(ExtMap, isl_dim_in, 1, 1);
getMatMulExt(Stmt->getIslCtx(), 0, MacroParams.Nc, MacroParams.Kc);
isl_map_move_dims(ExtMap, isl_dim_out, 0, isl_dim_in, 0, 1);
isl_map_move_dims(ExtMap, isl_dim_in, 2, isl_dim_out, 0, 1);
ExtMap = isl_map_project_out(ExtMap, isl_dim_in, 2, 1);
auto *Domain = Stmt->getDomain();
auto *NewStmt = Stmt->getParent()->addScopStmt(
OldAcc, MemAccessA->getAccessRelation(), isl_set_copy(Domain));
OldAcc, MemAccessB->getAccessRelation(), isl_set_copy(Domain));
ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, NewStmt->getDomainId());
Node = createExtensionNode(Node, ExtMap);
Node = isl_schedule_node_child(Node, 0);
AccRel = getMatMulAccRel(MapOldIndVar, MacroParams.Kc, 4, 7);
FirstDimSize = MacroParams.Nc * MacroParams.Kc / MicroParams.Nr;
SecondDimSize = MicroParams.Nr;
AccRel = getMatMulAccRel(MapOldIndVar, MacroParams.Kc, 4, 6);
FirstDimSize = MacroParams.Mc * MacroParams.Kc / MicroParams.Mr;
SecondDimSize = MicroParams.Mr;
SAI = Stmt->getParent()->createScopArrayInfo(
MemAccessB->getElementType(), "Packed_B", {FirstDimSize, SecondDimSize});
MemAccessA->getElementType(), "Packed_A", {FirstDimSize, SecondDimSize});
AccRel = isl_map_set_tuple_id(AccRel, isl_dim_out, SAI->getBasePtrId());
OldAcc = MemAccessB->getAccessRelation();
MemAccessB->setNewAccessRelation(AccRel);
ExtMap = getMatMulExt(Stmt->getIslCtx(), 0, MacroParams.Nc, MacroParams.Kc);
isl_map_move_dims(ExtMap, isl_dim_out, 0, isl_dim_in, 1, 1);
OldAcc = MemAccessA->getAccessRelation();
MemAccessA->setNewAccessRelation(AccRel);
ExtMap = getMatMulExt(Stmt->getIslCtx(), MacroParams.Mc, 0, MacroParams.Kc);
isl_map_move_dims(ExtMap, isl_dim_out, 0, isl_dim_in, 0, 1);
isl_map_move_dims(ExtMap, isl_dim_in, 2, isl_dim_out, 0, 1);
NewStmt = Stmt->getParent()->addScopStmt(
OldAcc, MemAccessB->getAccessRelation(), Domain);
OldAcc, MemAccessA->getAccessRelation(), Domain);
ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, NewStmt->getDomainId());
Node = createExtensionNode(Node, ExtMap);
Node = isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
Expand Down
50 changes: 25 additions & 25 deletions polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll
Expand Up @@ -9,37 +9,37 @@
; C[i][j] += alpha * A[i][k] * B[k][j];
; }
;
; CHECK: double Packed_A[ { [] -> [(1024)] } ][ { [] -> [(4)] } ]; // Element size 8
; CHECK: double Packed_B[ { [] -> [(3072)] } ][ { [] -> [(8)] } ]; // Element size 8
; CHECK: double Packed_B[ { [] -> [(512)] } ][ { [] -> [(8)] } ]; // Element size 8
; CHECK-NEXT: double Packed_A[ { [] -> [(6144)] } ][ { [] -> [(4)] } ]; // Element size 8
;
; CHECK: { Stmt_Copy_0[i0, i1, i2] -> MemRef_arg6[i0, i2] };
; CHECK: new: { Stmt_Copy_0[i0, i1, i2] -> Packed_A[o0, o1] : 256*floor((-i2 + o0)/256) = -i2 + o0 and 4*floor((-i0 + o1)/4) = -i0 + o1 and 0 <= o1 <= 3 and -3 + i0 - 16*floor((i0)/16) <= 4*floor((o0)/256) <= i0 - 16*floor((i0)/16) };
; CHECK-NEXT: new: { Stmt_Copy_0[i0, i1, i2] -> Packed_A[o0, o1] : 256*floor((-i2 + o0)/256) = -i2 + o0 and 4*floor((-i0 + o1)/4) = -i0 + o1 and 0 <= o1 <= 3 and -3 + i0 - 96*floor((i0)/96) <= 4*floor((o0)/256) <= i0 - 96*floor((i0)/96) };
;
; CHECK: { Stmt_Copy_0[i0, i1, i2] -> MemRef_arg7[i2, i1] };
; CHECK: new: { Stmt_Copy_0[i0, i1, i2] -> Packed_B[o0, o1] : 256*floor((-i2 + o0)/256) = -i2 + o0 and 8*floor((-i1 + o1)/8) = -i1 + o1 and 0 <= o1 <= 7 and -7 + i1 - 96*floor((i1)/96) <= 8*floor((o0)/256) <= i1 - 96*floor((i1)/96) };
; CHECK-NEXT: new: { Stmt_Copy_0[i0, i1, i2] -> Packed_B[o0, o1] : 256*floor((-i2 + o0)/256) = -i2 + o0 and 8*floor((-i1 + o1)/8) = -i1 + o1 and 0 <= o1 <= 7 and -7 + i1 - 16*floor((i1)/16) <= 8*floor((o0)/256) <= i1 - 16*floor((i1)/16) };
;
; CHECK: CopyStmt_0
; CHECK: Domain :=
; CHECK: { CopyStmt_0[i0, i1, i2] : 0 <= i0 <= 1055 and 0 <= i1 <= 1055 and 0 <= i2 <= 1023 };
; CHECK: Schedule :=
; CHECK: ;
; CHECK: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0]
; CHECK: null;
; CHECK: new: { CopyStmt_0[i0, i1, i2] -> Packed_A[o0, o1] : 256*floor((-i2 + o0)/256) = -i2 + o0 and 4*floor((-i0 + o1)/4) = -i0 + o1 and 0 <= o1 <= 3 and -3 + i0 - 16*floor((i0)/16) <= 4*floor((o0)/256) <= i0 - 16*floor((i0)/16) };
; CHECK: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
; CHECK: null;
; CHECK: new: { CopyStmt_0[i0, i1, i2] -> MemRef_arg6[i0, i2] };
; CHECK: CopyStmt_1
; CHECK: Domain :=
; CHECK: { CopyStmt_1[i0, i1, i2] : 0 <= i0 <= 1055 and 0 <= i1 <= 1055 and 0 <= i2 <= 1023 };
; CHECK: Schedule :=
; CHECK: ;
; CHECK: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0]
; CHECK: null;
; CHECK: new: { CopyStmt_1[i0, i1, i2] -> Packed_B[o0, o1] : 256*floor((-i2 + o0)/256) = -i2 + o0 and 8*floor((-i1 + o1)/8) = -i1 + o1 and 0 <= o1 <= 7 and -7 + i1 - 96*floor((i1)/96) <= 8*floor((o0)/256) <= i1 - 96*floor((i1)/96) };
; CHECK: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
; CHECK: null;
; CHECK: new: { CopyStmt_1[i0, i1, i2] -> MemRef_arg7[i2, i1] };
; CHECK-NEXT: Domain :=
; CHECK-NEXT: { CopyStmt_0[i0, i1, i2] : 0 <= i0 <= 1055 and 0 <= i1 <= 1055 and 0 <= i2 <= 1023 };
; CHECK-NEXT: Schedule :=
; CHECK-NEXT: ;
; CHECK-NEXT: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0]
; CHECK-NEXT: null;
; CHECK-NEXT: new: { CopyStmt_0[i0, i1, i2] -> Packed_B[o0, o1] : 256*floor((-i2 + o0)/256) = -i2 + o0 and 8*floor((-i1 + o1)/8) = -i1 + o1 and 0 <= o1 <= 7 and -7 + i1 - 16*floor((i1)/16) <= 8*floor((o0)/256) <= i1 - 16*floor((i1)/16) };
; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
; CHECK-NEXT: null;
; CHECK-NEXT: new: { CopyStmt_0[i0, i1, i2] -> MemRef_arg7[i2, i1] };
; CHECK-NEXT: CopyStmt_1
; CHECK-NEXT: Domain :=
; CHECK-NEXT: { CopyStmt_1[i0, i1, i2] : 0 <= i0 <= 1055 and 0 <= i1 <= 1055 and 0 <= i2 <= 1023 };
; CHECK-NEXT: Schedule :=
; CHECK-NEXT: ;
; CHECK-NEXT: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0]
; CHECK-NEXT: null;
; CHECK-NEXT: new: { CopyStmt_1[i0, i1, i2] -> Packed_A[o0, o1] : 256*floor((-i2 + o0)/256) = -i2 + o0 and 4*floor((-i0 + o1)/4) = -i0 + o1 and 0 <= o1 <= 3 and -3 + i0 - 96*floor((i0)/96) <= 4*floor((o0)/256) <= i0 - 96*floor((i0)/96) };
; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
; CHECK-NEXT: null;
; CHECK-NEXT: new: { CopyStmt_1[i0, i1, i2] -> MemRef_arg6[i0, i2] };
;
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"
Expand Down

0 comments on commit 8babe1a

Please sign in to comment.