Skip to content

Commit

Permalink
The order of the loops defines the data reused in the BLIS implementa…
Browse files Browse the repository at this point in the history
…tion of

gemm ([1]). In particular, elements of the matrix B, the second operand of
matrix multiplication, are reused between iterations of the innermost loop.
To keep the reused data in cache, only elements of matrix A, the first operand
of matrix multiplication, should be evicted during an iteration of the
innermost loop. To provide such a cache replacement policy, elements of the
matrix A can, in particular, be loaded first and, consequently, be
least-recently-used.

In our case matrices are stored in row-major order instead of column-major
order used in the BLIS implementation ([1]). One of the ways to address it is
to accordingly change the order of the loops of the loop nest. However, this
causes elements of the matrix A to be reused in the innermost loop and,
consequently, requires loading elements of the matrix B first. Since the LLVM
vectorizer always generates loads from the matrix A before loads from the
matrix B, we cannot provide such a policy. Consequently, we only change the BLIS micro
kernel and the computation of its parameters instead. In particular, reused
elements of the matrix B are successively multiplied by specific elements of
the matrix A.

Refs.:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: https://reviews.llvm.org/D25653

llvm-svn: 289806
  • Loading branch information
gareevroman committed Dec 15, 2016
1 parent 552c8e9 commit 8babe1a
Show file tree
Hide file tree
Showing 4 changed files with 184 additions and 160 deletions.
25 changes: 19 additions & 6 deletions polly/include/polly/ScheduleOptimizer.h
Expand Up @@ -134,16 +134,29 @@ class ScheduleTreeOptimizer {
/// a micro-kernel. The micro-kernel is a loop around a rank-1
/// (i.e., outer product) update.
///
/// For a detailed description please see:
/// Analytical Modeling is Enough for High Performance BLIS
/// For a detailed description please see [1].
///
/// The order of the loops defines the data reused in the BLIS implementation
/// of gemm ([1]). In particular, elements of the matrix B, the second
/// operand of matrix multiplication, are reused between iterations of the
/// innermost loop. To keep the reused data in cache, only elements of matrix
/// A, the first operand of matrix multiplication, should be evicted during
/// an iteration of the innermost loop. To provide such a cache replacement
/// policy, elements of the matrix A can, in particular, be loaded first and,
/// consequently, be least-recently-used.
///
/// In our case matrices are stored in row-major order instead of
/// column-major order used in the BLIS implementation ([1]). This affects
/// only the form of the BLIS micro kernel and the computation of its
/// parameters. In particular, reused elements of the matrix B are
/// successively multiplied by specific elements of the matrix A.
///
/// Refs.:
/// [1] - Analytical Modeling is Enough for High Performance BLIS
/// Tze Meng Low, Francisco D Igual, Tyler M Smith, Enrique S Quintana-Orti
/// Technical Report, 2014
/// http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
///
/// In our case matrices are stored in row-major order, which is taken into
/// account during the creation of the BLIS kernels and the computation
/// of their parameters.
///
/// @see ScheduleTreeOptimizer::createMicroKernel
/// @see ScheduleTreeOptimizer::createMacroKernel
/// @see getMicroKernelParams
Expand Down
60 changes: 31 additions & 29 deletions polly/lib/Transform/ScheduleOptimizer.cpp
Expand Up @@ -538,8 +538,10 @@ permuteBandNodeDimensions(__isl_take isl_schedule_node *Node, unsigned FirstDim,

__isl_give isl_schedule_node *ScheduleTreeOptimizer::createMicroKernel(
__isl_take isl_schedule_node *Node, MicroKernelParamsTy MicroKernelParams) {
return applyRegisterTiling(Node, {MicroKernelParams.Mr, MicroKernelParams.Nr},
1);
applyRegisterTiling(Node, {MicroKernelParams.Mr, MicroKernelParams.Nr}, 1);
Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
Node = permuteBandNodeDimensions(Node, 0, 1);
return isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
}

__isl_give isl_schedule_node *ScheduleTreeOptimizer::createMacroKernel(
Expand All @@ -553,6 +555,7 @@ __isl_give isl_schedule_node *ScheduleTreeOptimizer::createMacroKernel(
{MacroKernelParams.Mc, MacroKernelParams.Nc, MacroKernelParams.Kc}, 1);
Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
Node = permuteBandNodeDimensions(Node, 1, 2);
Node = permuteBandNodeDimensions(Node, 0, 2);
return isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
}

Expand Down Expand Up @@ -609,18 +612,15 @@ getMacroKernelParams(const MicroKernelParamsTy &MicroKernelParams) {
return {1, 1, 1};
int Cbr = floor(
(CacheLevelAssociativity[0] - 1) /
(1 + static_cast<double>(MicroKernelParams.Mr) / MicroKernelParams.Nr));
(1 + static_cast<double>(MicroKernelParams.Nr) / MicroKernelParams.Mr));
int Kc = (Cbr * CacheLevelSizes[0]) /
(MicroKernelParams.Nr * CacheLevelAssociativity[0] * 8);
double Cac = static_cast<double>(MicroKernelParams.Mr * Kc * 8 *
CacheLevelAssociativity[1]) /
(MicroKernelParams.Mr * CacheLevelAssociativity[0] * 8);
double Cac = static_cast<double>(Kc * 8 * CacheLevelAssociativity[1]) /
CacheLevelSizes[1];
double Cbc = static_cast<double>(MicroKernelParams.Nr * Kc * 8 *
CacheLevelAssociativity[1]) /
double Cbc = static_cast<double>(Kc * 8 * CacheLevelAssociativity[1]) /
CacheLevelSizes[1];
int Mc = floor(MicroKernelParams.Mr / Cac);
int Nc =
floor((MicroKernelParams.Nr * (CacheLevelAssociativity[1] - 2)) / Cbc);
int Mc = floor((CacheLevelAssociativity[1] - 2) / Cac);
int Nc = floor(1 / Cbc);
return {Mc, Nc, Kc};
}

Expand Down Expand Up @@ -867,36 +867,38 @@ static __isl_give isl_schedule_node *optimizeDataLayoutMatrMulPattern(
Node = isl_schedule_node_parent(Node);
Node = isl_schedule_node_child(isl_schedule_node_band_split(Node, 2), 0);
auto *AccRel =
getMatMulAccRel(isl_map_copy(MapOldIndVar), MacroParams.Kc, 3, 6);
unsigned FirstDimSize = MacroParams.Mc * MacroParams.Kc / MicroParams.Mr;
unsigned SecondDimSize = MicroParams.Mr;
getMatMulAccRel(isl_map_copy(MapOldIndVar), MacroParams.Kc, 3, 7);
unsigned FirstDimSize = MacroParams.Nc * MacroParams.Kc / MicroParams.Nr;
unsigned SecondDimSize = MicroParams.Nr;
auto *SAI = Stmt->getParent()->createScopArrayInfo(
MemAccessA->getElementType(), "Packed_A", {FirstDimSize, SecondDimSize});
MemAccessB->getElementType(), "Packed_B", {FirstDimSize, SecondDimSize});
AccRel = isl_map_set_tuple_id(AccRel, isl_dim_out, SAI->getBasePtrId());
auto *OldAcc = MemAccessA->getAccessRelation();
MemAccessA->setNewAccessRelation(AccRel);
auto *OldAcc = MemAccessB->getAccessRelation();
MemAccessB->setNewAccessRelation(AccRel);
auto *ExtMap =
getMatMulExt(Stmt->getIslCtx(), MacroParams.Mc, 0, MacroParams.Kc);
ExtMap = isl_map_project_out(ExtMap, isl_dim_in, 1, 1);
getMatMulExt(Stmt->getIslCtx(), 0, MacroParams.Nc, MacroParams.Kc);
isl_map_move_dims(ExtMap, isl_dim_out, 0, isl_dim_in, 0, 1);
isl_map_move_dims(ExtMap, isl_dim_in, 2, isl_dim_out, 0, 1);
ExtMap = isl_map_project_out(ExtMap, isl_dim_in, 2, 1);
auto *Domain = Stmt->getDomain();
auto *NewStmt = Stmt->getParent()->addScopStmt(
OldAcc, MemAccessA->getAccessRelation(), isl_set_copy(Domain));
OldAcc, MemAccessB->getAccessRelation(), isl_set_copy(Domain));
ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, NewStmt->getDomainId());
Node = createExtensionNode(Node, ExtMap);
Node = isl_schedule_node_child(Node, 0);
AccRel = getMatMulAccRel(MapOldIndVar, MacroParams.Kc, 4, 7);
FirstDimSize = MacroParams.Nc * MacroParams.Kc / MicroParams.Nr;
SecondDimSize = MicroParams.Nr;
AccRel = getMatMulAccRel(MapOldIndVar, MacroParams.Kc, 4, 6);
FirstDimSize = MacroParams.Mc * MacroParams.Kc / MicroParams.Mr;
SecondDimSize = MicroParams.Mr;
SAI = Stmt->getParent()->createScopArrayInfo(
MemAccessB->getElementType(), "Packed_B", {FirstDimSize, SecondDimSize});
MemAccessA->getElementType(), "Packed_A", {FirstDimSize, SecondDimSize});
AccRel = isl_map_set_tuple_id(AccRel, isl_dim_out, SAI->getBasePtrId());
OldAcc = MemAccessB->getAccessRelation();
MemAccessB->setNewAccessRelation(AccRel);
ExtMap = getMatMulExt(Stmt->getIslCtx(), 0, MacroParams.Nc, MacroParams.Kc);
isl_map_move_dims(ExtMap, isl_dim_out, 0, isl_dim_in, 1, 1);
OldAcc = MemAccessA->getAccessRelation();
MemAccessA->setNewAccessRelation(AccRel);
ExtMap = getMatMulExt(Stmt->getIslCtx(), MacroParams.Mc, 0, MacroParams.Kc);
isl_map_move_dims(ExtMap, isl_dim_out, 0, isl_dim_in, 0, 1);
isl_map_move_dims(ExtMap, isl_dim_in, 2, isl_dim_out, 0, 1);
NewStmt = Stmt->getParent()->addScopStmt(
OldAcc, MemAccessB->getAccessRelation(), Domain);
OldAcc, MemAccessA->getAccessRelation(), Domain);
ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, NewStmt->getDomainId());
Node = createExtensionNode(Node, ExtMap);
Node = isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
Expand Down
50 changes: 25 additions & 25 deletions polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll
Expand Up @@ -9,37 +9,37 @@
; C[i][j] += alpha * A[i][k] * B[k][j];
; }
;
; CHECK: double Packed_A[ { [] -> [(1024)] } ][ { [] -> [(4)] } ]; // Element size 8
; CHECK: double Packed_B[ { [] -> [(3072)] } ][ { [] -> [(8)] } ]; // Element size 8
; CHECK: double Packed_B[ { [] -> [(512)] } ][ { [] -> [(8)] } ]; // Element size 8
; CHECK-NEXT: double Packed_A[ { [] -> [(6144)] } ][ { [] -> [(4)] } ]; // Element size 8
;
; CHECK: { Stmt_Copy_0[i0, i1, i2] -> MemRef_arg6[i0, i2] };
; CHECK: new: { Stmt_Copy_0[i0, i1, i2] -> Packed_A[o0, o1] : 256*floor((-i2 + o0)/256) = -i2 + o0 and 4*floor((-i0 + o1)/4) = -i0 + o1 and 0 <= o1 <= 3 and -3 + i0 - 16*floor((i0)/16) <= 4*floor((o0)/256) <= i0 - 16*floor((i0)/16) };
; CHECK-NEXT: new: { Stmt_Copy_0[i0, i1, i2] -> Packed_A[o0, o1] : 256*floor((-i2 + o0)/256) = -i2 + o0 and 4*floor((-i0 + o1)/4) = -i0 + o1 and 0 <= o1 <= 3 and -3 + i0 - 96*floor((i0)/96) <= 4*floor((o0)/256) <= i0 - 96*floor((i0)/96) };
;
; CHECK: { Stmt_Copy_0[i0, i1, i2] -> MemRef_arg7[i2, i1] };
; CHECK: new: { Stmt_Copy_0[i0, i1, i2] -> Packed_B[o0, o1] : 256*floor((-i2 + o0)/256) = -i2 + o0 and 8*floor((-i1 + o1)/8) = -i1 + o1 and 0 <= o1 <= 7 and -7 + i1 - 96*floor((i1)/96) <= 8*floor((o0)/256) <= i1 - 96*floor((i1)/96) };
; CHECK-NEXT: new: { Stmt_Copy_0[i0, i1, i2] -> Packed_B[o0, o1] : 256*floor((-i2 + o0)/256) = -i2 + o0 and 8*floor((-i1 + o1)/8) = -i1 + o1 and 0 <= o1 <= 7 and -7 + i1 - 16*floor((i1)/16) <= 8*floor((o0)/256) <= i1 - 16*floor((i1)/16) };
;
; CHECK: CopyStmt_0
; CHECK: Domain :=
; CHECK: { CopyStmt_0[i0, i1, i2] : 0 <= i0 <= 1055 and 0 <= i1 <= 1055 and 0 <= i2 <= 1023 };
; CHECK: Schedule :=
; CHECK: ;
; CHECK: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0]
; CHECK: null;
; CHECK: new: { CopyStmt_0[i0, i1, i2] -> Packed_A[o0, o1] : 256*floor((-i2 + o0)/256) = -i2 + o0 and 4*floor((-i0 + o1)/4) = -i0 + o1 and 0 <= o1 <= 3 and -3 + i0 - 16*floor((i0)/16) <= 4*floor((o0)/256) <= i0 - 16*floor((i0)/16) };
; CHECK: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
; CHECK: null;
; CHECK: new: { CopyStmt_0[i0, i1, i2] -> MemRef_arg6[i0, i2] };
; CHECK: CopyStmt_1
; CHECK: Domain :=
; CHECK: { CopyStmt_1[i0, i1, i2] : 0 <= i0 <= 1055 and 0 <= i1 <= 1055 and 0 <= i2 <= 1023 };
; CHECK: Schedule :=
; CHECK: ;
; CHECK: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0]
; CHECK: null;
; CHECK: new: { CopyStmt_1[i0, i1, i2] -> Packed_B[o0, o1] : 256*floor((-i2 + o0)/256) = -i2 + o0 and 8*floor((-i1 + o1)/8) = -i1 + o1 and 0 <= o1 <= 7 and -7 + i1 - 96*floor((i1)/96) <= 8*floor((o0)/256) <= i1 - 96*floor((i1)/96) };
; CHECK: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
; CHECK: null;
; CHECK: new: { CopyStmt_1[i0, i1, i2] -> MemRef_arg7[i2, i1] };
; CHECK-NEXT: Domain :=
; CHECK-NEXT: { CopyStmt_0[i0, i1, i2] : 0 <= i0 <= 1055 and 0 <= i1 <= 1055 and 0 <= i2 <= 1023 };
; CHECK-NEXT: Schedule :=
; CHECK-NEXT: ;
; CHECK-NEXT: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0]
; CHECK-NEXT: null;
; CHECK-NEXT: new: { CopyStmt_0[i0, i1, i2] -> Packed_B[o0, o1] : 256*floor((-i2 + o0)/256) = -i2 + o0 and 8*floor((-i1 + o1)/8) = -i1 + o1 and 0 <= o1 <= 7 and -7 + i1 - 16*floor((i1)/16) <= 8*floor((o0)/256) <= i1 - 16*floor((i1)/16) };
; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
; CHECK-NEXT: null;
; CHECK-NEXT: new: { CopyStmt_0[i0, i1, i2] -> MemRef_arg7[i2, i1] };
; CHECK-NEXT: CopyStmt_1
; CHECK-NEXT: Domain :=
; CHECK-NEXT: { CopyStmt_1[i0, i1, i2] : 0 <= i0 <= 1055 and 0 <= i1 <= 1055 and 0 <= i2 <= 1023 };
; CHECK-NEXT: Schedule :=
; CHECK-NEXT: ;
; CHECK-NEXT: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0]
; CHECK-NEXT: null;
; CHECK-NEXT: new: { CopyStmt_1[i0, i1, i2] -> Packed_A[o0, o1] : 256*floor((-i2 + o0)/256) = -i2 + o0 and 4*floor((-i0 + o1)/4) = -i0 + o1 and 0 <= o1 <= 3 and -3 + i0 - 96*floor((i0)/96) <= 4*floor((o0)/256) <= i0 - 96*floor((i0)/96) };
; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
; CHECK-NEXT: null;
; CHECK-NEXT: new: { CopyStmt_1[i0, i1, i2] -> MemRef_arg6[i0, i2] };
;
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"
Expand Down

0 comments on commit 8babe1a

Please sign in to comment.