/
gemm_tiling.nim
249 lines (217 loc) · 8.45 KB
/
gemm_tiling.nim
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
# Laser
# Copyright (c) 2018 Mamy André-Ratsimbazafy
# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0).
# This file may not be copied, modified, or distributed except according to those terms.
# ############################################################
#
# Cache and register optimizations
#
# ############################################################
# Papers:
# [1] Anatomy of High-Performance Matrix Multiplication (Revised)
# Kazushige Goto, Robert A. Van de Geijn
# - http://www.cs.utexas.edu/~flame/pubs/GotoTOMS_revision.pdf
#
# [2] Anatomy of High-Performance Many-Threaded Matrix Multiplication
# Smith et al
# - http://www.cs.utexas.edu/users/flame/pubs/blis3_ipdps14.pdf
#
# [3] Automating the Last-Mile for High Performance Dense Linear Algebra
# Veras et al
# - https://arxiv.org/pdf/1611.08035.pdf
#
# [4] GEMM: From Pure C to SSE Optimized Micro Kernels
# Michael Lehn
# - http://apfel.mathematik.uni-ulm.de/~lehn/sghpc/gemm/index.html
#
# Laser wiki - GEMM optimization resources
# - https://github.com/numforge/laser/wiki/GEMM-optimization-resources
import
../../cpuinfo, ../../compiler_optim_hints,
../../private/memory,
typetraits, macros,
./gemm_utils
# ############################################################
#
# Microkernel (µkernel)
#
# ############################################################
# We have to take into account vectorisation
# so that the microkernel can be processed with vectorized intrinsics.
#
# Caux [mr, nr] must stay in register.
# - mr ~= nr is optimal to amortize register load cost
# - some registers must be left to prefetch à and ~B (PackedA and PackedB)
# - nr >= (flops/cycle) / (bytes/cycle) * sizeof(element)
#
# For example Haswell is capable of
# - 32 single-precision FLOPs/cycle
# - 32 bytes/cycle store and 64 bytes/cycle load (store C, load A and B)
#
# so nr >= 32/32 * 4
# For that number of FLOP it must schedule
# 2xFMA so consume 16 single-precision float
# so mr*nr >= 16
type
MicroKernel* = object
mr*, nr*: int
cpu_simd*: CPUFeatureX86
nb_scalars*: int # Ideally MicroKernel should be generic over T
nb_vecs_nr*: int
c_unit_stride*: bool # We can use SIMD for the epilogue of C has a unit_stride
# TODO: ARM support
# - https://github.com/nim-lang/Nim/issues/9679
# - https://github.com/nim-lang/Nim/issues/9678
CPUFeatureX86* = enum
x86_Generic,
x86_SSE,
x86_SSE2,
x86_SSE4_1,
x86_AVX,
x86_AVX_FMA,
x86_AVX2
# x86_AVX512 # TODO
# Note that Skylake SP, Xeon Bronze Silver and Gold 5XXX
# only have a single AVX512 port and AVX2 can be faster.
X86_FeatureMap = array[CPUFeatureX86, int]
const X86_vecsize_float: X86_FeatureMap = [
x86_Generic: 1,
x86_SSE: 128 div 8,
x86_SSE2: 128 div 8,
x86_SSE4_1: 128 div 8,
x86_AVX: 256 div 8,
x86_AVX_FMA: 256 div 8,
x86_AVX2: 256 div 8
# x86_AVX512: 512 div 8
]
const X86_vecsize_int: X86_FeatureMap = [
x86_Generic: 1,
x86_SSE: 1,
x86_SSE2: 128 div 8,
x86_SSE4_1: 128 div 8,
x86_AVX: 128 div 8, # Not even addition with integer AVX
x86_AVX_FMA: 128 div 8,
x86_AVX2: 256 div 8,
# x86_AVX512: 512 div 8
]
# mr * nr < number of registers - 4
# 4 registers are needed to hold à and ~B and loop index
const X86_regs: X86_FeatureMap = [
x86_Generic: 2,
x86_SSE: 2, # 8 XMM regs in 32-bit, 16 in 64-bit (we assume 32-bit mode)
x86_SSE2: 6,
x86_SSE4_1: 6,
x86_AVX: 6, # 16 YMM registers
x86_AVX_FMA: 6,
x86_AVX2: 6
# x86_AVX512: 6 # 32 ZMM registers
]
func x86_ukernel*(cpu: CPUFeatureX86, T: typedesc, c_unit_stride: bool): MicroKernel =
result.cpu_simd = cpu
result.c_unit_stride = c_unit_stride
when T is SomeFloat:
result.nb_scalars = max(1, X86_vecsize_float[cpu] div T.sizeof)
elif T is SomeInteger: # Integers
result.nb_scalars = max(1, X86_vecsize_int[cpu] div T.sizeof)
else:
{.error: "Unsupported type: " & T.type.name.}
# The inner microkernel loop does:
# AB[m][n] = A[m] * B[n]
# So n should be the vector size
# if most matrices are row-Major.
# This avoids dealing with transpose
# in the inner loop and untranspose in the epilogue
result.mr = X86_regs[cpu] # 2~6 registers for the rows of Ã
result.nb_vecs_nr = 2 # x2 for 2 SIMD vectors of B
result.nr = result.nb_vecs_nr * result.nb_scalars
#############################################
# Workaround "undeclared identifier mr or nr"
# for some reason the compiler cannot access fields in
# the static MicroKernel.
macro extract_mr*(ukernel: static MicroKernel): untyped =
result = newLit ukernel.mr
macro extract_nr*(ukernel: static MicroKernel): untyped =
result = newLit ukernel.nr
macro extract_cpu_simd*(ukernel: static MicroKernel): untyped =
let simd = ukernel.cpu_simd
result = quote do: CPUFeatureX86(`simd`)
macro extract_nb_scalars*(ukernel: static MicroKernel): untyped =
result = newLit ukernel.nb_scalars
macro extract_nb_vecs_nr*(ukernel: static MicroKernel): untyped =
result = newLit ukernel.nb_vecs_nr
macro extract_c_unit_stride*(ukernel: static MicroKernel): untyped =
result = newLit ukernel.c_unit_stride
# ############################################################
#
# Loop tiling
#
# ############################################################
# multithreading info in [2] and https://github.com/flame/blis/blob/master/docs/Multithreading.md
type Tiles*[T] = ref object
a*: ptr UncheckedArray[T]
b*: ptr UncheckedArray[T]
mc*, nc*, kc*: int
# Multithreading
# Nim doesn't support arbitrary increment with OpenMP
# So we store indexing/edge case data in tiles for the parallelized loop
ic_num_mc_tiles*: int # For private L1-L2 and shared L3
# jr_num_nr_tiles*: int # For private L1 and shared L2 cache
# Allocation data
# TODO Save on cache line, use an aligned allocator to not track this
a_alloc_mem: pointer
b_alloc_mem: pointer
# The Tiles data structure takes 64-byte = 1 cache-line
proc deallocTiles[T](tiles: Tiles[T]) =
if not tiles.a_alloc_mem.isNil:
deallocShared tiles.a_alloc_mem
if not tiles.b_alloc_mem.isNil:
deallocShared tiles.b_alloc_mem
proc newTiles*(
ukernel: static MicroKernel,
T: typedesc,
M, N, K: Natural,
): Tiles[T] =
# BLIS paper [2] section II Figure 2:
# - kc * nr in L1 cache µkernel
# - mc * kc in L2 cache Ã
# - kc * nc in L3 cache ~B (no L3 in Xeon Phi ¯\_(ツ)_/¯)
new result, deallocTiles[T]
const
nr = ukernel.nr
mr = ukernel.mr
result.nc = N # We don't partition over N
# ## Panel sizes
# - TLB constraint
# TA ̃ + 2(TBj + TCj)≤T
# Note: optimizing the similar problem mckc/(2mc+2kc)
# under the constraint that mckc ≤ K is the problem
# of maximizing the area of a rectangle
# while minimizing the perimeter,
#
# Goto paper [1] section 6.3: choosing kc
# - kc should be as large as possible to amortize the mr*nr updates of Cj
# - Elements from Bj [kc, nr] must remain in L1 cache.
# - kc * nr should occupy less than half the L1 cache
# so that à and Caux do not evict element of Bj
# - Ã [kc, mc] should occupy
# a considerable fraction of the L2 cache
# In our experience optimal choice is so that "kc" float64 occupy half a page
# -> a page is 4096 bytes = 512 float64 -> half a page = 256
# Goto paper [1] section 6.3: choosing mc
# - mc*kc should fill a considerable part of (1) the memory addressable
# by the TLB and (2) the L2 cache
# In practice mc is chosen so that A occupies about half the smaller of (1) and (2)
# TODO: heuristics to compute the size
result.mc = min( 768 div T.sizeof, M)
result.kc = min(2048 div T.sizeof, K)
# Parallel config
# Ic loop parallel means that each thread will share a panel B and pack a different A
result.ic_num_mc_tiles = (M+result.mc-1) div result.mc
# Packing
# During packing the max size is unroll_stop*kc+kc*LR, LR = MR or NR
let bufA_size = T.sizeof * result.kc*(result.mc+mr) * result.ic_num_mc_tiles
let bufB_size = T.sizeof * result.kc*(result.nc+nr)
result.a_alloc_mem = allocShared(bufA_size + 63)
result.b_alloc_mem = allocShared(bufB_size + 63)
result.a = assume_aligned align_raw_data(T, result.a_alloc_mem)
result.b = assume_aligned align_raw_data(T, result.b_alloc_mem)