# gemm_bench_float32.nim
# Apache v2 License
# Mamy Ratsimbazafy
# ##########################################
# Benchmarking tools
import random, times, stats, strformat, math, sequtils
proc warmup() =
  ## Spin the CPU on a tight integer loop so frequency scaling settles
  ## at max performance before any measured benchmark starts.
  let t0 = epochTime()
  var acc = 123
  for i in 0 ..< 300_000_000:
    acc += i*i mod 456
    acc = acc mod 789
  let t1 = epochTime()
  # Printing the accumulator keeps the loop observable,
  # so the optimizer cannot delete the warmup work.
  echo &"Warmup: {t1 - t0:>4.4f} s, result {acc} (displayed to avoid compiler optimizing warmup away)"
# Pretty-print the timing statistics collected for one benchmark run.
# Dirty template: it reads `stats` (a RunningStat), `global_start`,
# `global_stop` and `req_ops` from the instantiation scope — these are
# the locals declared by the `bench` template below.
template printStats(name: string, output: openarray) {.dirty.} =
  echo "\n" & name
  echo &"Collected {stats.n} samples in {global_stop - global_start:>4.3f} seconds"
  echo &"Average time: {stats.mean * 1000 :>4.3f} ms"
  echo &"Stddev time: {stats.standardDeviationS * 1000 :>4.3f} ms"
  echo &"Min time: {stats.min * 1000 :>4.3f} ms"
  echo &"Max time: {stats.max * 1000 :>4.3f} ms"
  echo &"Perf: {req_ops.float / stats.mean / float(10^9):>4.3f} GFLOP/s"
  echo "\nDisplay output[0] to make sure it's not optimized away"
  echo output[0] # Prevents compiler from optimizing stuff away
# Benchmark driver: runs `initialisation` (untimed) then `body` (timed)
# `nb_samples` times and accumulates per-sample wall-clock durations.
# Dirty template: expects `nb_samples`, `req_ops` and `output` to exist
# at the call site; the locals declared here are read by `printStats`.
template bench(name: string, initialisation, body: untyped) {.dirty.}=
  block: # Actual bench
    var stats: RunningStat
    let global_start = epochTime()
    for _ in 0 ..< nb_samples:
      initialisation
      let start = epochTime()
      body
      let stop = epochTime()
      stats.push stop - start
    let global_stop = epochTime()
    printStats(name, output)
# #############################################
# Params
import
./gemm_common,
../blas,
./arraymancer/blas_l3_gemm,
../../laser/primitives/matrix_multiplication/gemm
const
  M = 16*6*20
  K = 16*6*20
  N = 16*6*20
  NbSamples = 10  # This might stress the allocator when packing if the matrices are big
  CpuGhz = 2.7    # Assuming no turbo
  NumCpuCores = 2
  CpuFlopCycle = 32 # AVX2: 2xFMA/cycle = 2x8x2 - 2 x 8 floats x (1 add + 1 mul)

const
  ashape: MatrixShape = (M, K)
  bshape: MatrixShape = (K, N)

# Workload characteristics derived from the shapes above,
# used for the GFLOP/s and arithmetic-intensity reports.
let req_ops = gemm_required_ops(ashape, bshape)
let req_bytes = sizeof(float32) * gemm_required_data(ashape, bshape)
let out_shape: MatrixShape = gemm_out_shape(ashape, bshape)
let out_size = out_shape.M * out_shape.N
# #############################################
# C and C++ FFI
import ospaths, strutils
from os import DirSep
# Directory containing this source file, used to locate the
# bundled C/C++ sources (e.g. pytorch_glow) at compile time.
const cSourcesPath = currentSourcePath.rsplit(DirSep, 1)[0] & '/'
# #############################################
proc benchOpenBLAS(a, b: seq[float32], nb_samples: int) =
  ## Benchmark the system BLAS sgemm (row-major, no transposition):
  ## output = 1 * A[M,K] * B[K,N] + 0 * C.
  var output = newSeq[float32](out_size)
  bench("OpenBLAS benchmark"):
    # Initialisation, not measured apart for the "Collected n samples in ... seconds"
    # Fix: zeroMem takes a byte count, not an element count — passing bare
    # `out_size` only cleared a quarter of the float32 buffer.
    zeroMem(output[0].addr, out_size * sizeof(float32))
  do:
    # Main work
    gemm(
      rowMajor, noTranspose, noTranspose,
      M, N, K,
      1, a[0].unsafeaddr, K,
      b[0].unsafeAddr, N,
      0, output[0].addr, N
    )
proc benchArraymancerFallback(a, b: seq[float32], nb_samples: int) =
  ## Benchmark Arraymancer's pure-Nim fallback gemm implementation.
  var output = newSeq[float32](out_size)
  bench("Arraymancer fallback BLAS"):
    # Initialisation, not measured apart for the "Collected n samples in ... seconds"
    # Fix: zeroMem counts bytes — scale the element count by the element
    # size so the whole buffer is cleared between samples.
    zeroMem(output[0].addr, out_size * sizeof(float32))
  do:
    # Main work
    gemm_nn_fallback(
      M, N, K,
      1'f32, a, 0, K, 1, # offset, stride row, stride col
      b, 0, N, 1,
      0'f32, output, 0, N, 1
    )
proc benchSimpleTiling(a, b: seq[float32], nb_samples: int) {.noinline.}=
  ## Benchmark a naive cache-tiled matmul written in emitted C (tile = 32).
  ## The C kernel accumulates into C (`C[i][jj] += ...`), so the output
  ## buffer must be fully cleared between samples for correct results.
  var output = newSeq[float32](out_size)
  let pa = a[0].unsafeAddr
  let pb = b[0].unsafeAddr
  let po = output[0].addr
  const blck = 32
  bench("Simple Tiling"):
    # Initialisation, not measured apart for the "Collected n samples in ... seconds"
    # Fix: zeroMem takes bytes, not elements — without the sizeof scaling
    # only a quarter of the buffer was zeroed, and samples after the first
    # accumulated on top of stale data.
    zeroMem(output[0].addr, out_size * sizeof(float32))
  do:
    {.emit: """
      #define min(a,b) (((a)<(b))?(a):(b))

      float (* restrict A)[`K`] = (void*)`pa`;
      float (* restrict B)[`N`] = (void*)`pb`;
      float (* restrict C)[`N`] = (void*)`po`;

      // TODO: where to parallelize?
      for (int j = 0; j < `N`; j+=`blck`)
        for (int k = 0; k < `K`; k+=`blck`)
          for (int i = 0; i < `M`; i++)
            for (int jj = j; jj<min(j+`blck`, `N`); jj++)
              for (int kk = k; kk<min(k+`blck`, `K`); kk++)
                C[i][jj] += A[i][kk] * B[kk][jj];
    """.}
proc benchLaserGEMM(a, b: seq[float32], nb_samples: int) =
  ## Benchmark Laser's production `gemm_strided` implementation.
  var output = newSeq[float32](out_size)
  let a_ptr{.restrict.} = a[0].unsafeAddr
  let b_ptr{.restrict.} = b[0].unsafeAddr
  let c_ptr{.restrict.} = output[0].addr
  bench("Laser production implementation"):
    # Initialisation, not measured apart for the "Collected n samples in ... seconds"
    # Fix: zeroMem takes a byte count — bare `out_size` only cleared
    # out_size bytes, i.e. a quarter of the float32 buffer.
    zeroMem(output[0].addr, out_size * sizeof(float32))
  do:
    # Main work
    gemm_strided(
      M, N, K,
      1'f32, a_ptr, K, 1, # stride row, stride col
      b_ptr, N, 1,
      0'f32, c_ptr, N, 1
    )
{.passC: "-I" & cSourcesPath & "pytorch_glow/".}
import pytorch_glow/libjit_matmul
# Hack due to conflicts between "-std=c++11" requires by Glow
# and incompatible with C files in cpuinfo.
# We can't use the proper:
# {.compile: "pytorch_glow/libjit_matmul.cpp".}
# {.passC: "-std=c++11 -mavx -mfma".}
# ^^^ This is configured in nim.cfg instead
# C FFI into Glow's libjit single-precision matmul kernel.
# `c` is the destination buffer; each `*Dims` points to a [rows, cols] pair.
proc libjit_matmul_f(
    c, a, b: ptr float32,
    cDims, aDims, bDims: ptr array[2, int]
  ) {.importc, cdecl.}
# Note: Matrix C will be zero-mem'ed by libjit
proc benchPyTorchGlow(a, b: seq[float32], nb_samples: int) =
  ## Benchmark PyTorch Glow's libjit matmul. No zeroing is needed in the
  ## initialisation step: libjit clears the C matrix itself.
  var output = newSeq[float32](out_size)
  # Shape descriptors in the [rows, cols] layout libjit expects.
  let
    cDims = [M, N]
    aDims = [M, K]
    bDims = [K, N]
  # Restrict-qualified pointers handed to the C kernel.
  let
    a_ptr{.restrict.} = a[0].unsafeAddr
    b_ptr{.restrict.} = b[0].unsafeAddr
    c_ptr{.restrict.} = output[0].addr
    cDims_ptr{.restrict.} = cDims.unsafeAddr
    aDims_ptr{.restrict.} = aDims.unsafeAddr
    bDims_ptr{.restrict.} = bDims.unsafeAddr
  bench("PyTorch Glow: libjit matmul implementation"):
    discard # zeroMem done by libjit
  do:
    # Main work
    libjit_matmul_f(
      c_ptr, a_ptr, b_ptr,
      cDims_ptr, aDims_ptr, bDims_ptr
    )
# ###########################################
# Optional compilation flags, enabled with -d:fast_math,
# -d:march_native and -d:openmp respectively.
when defined(fast_math):
  {.passC:"-ffast-math".}

when defined(march_native):
  {.passC:"-march=native".}

when defined(openmp):
  {.passC: "-fopenmp".}
  {.passL: "-fopenmp".}
when isMainModule:
  # Warm up the CPU, report the workload characteristics, then run
  # every benchmark on the same random input matrices.
  randomize(42) # For reproducibility
  warmup()
  echo ""
  echo "A matrix shape: ", ashape
  echo "B matrix shape: ", bshape
  echo "Output shape: ", out_shape
  echo &"Required number of operations: {req_ops.float / float(10^6):>9.3f} millions"
  echo &"Required bytes: {req_bytes.float / float(10^6):>9.3f} MB"
  echo &"Arithmetic intensity: {req_ops.float / req_bytes.float:>9.3f} FLOP/byte"
  echo &"Theoretical peak single-core: {CpuGhz * CpuFlopCycle:>9.3f} GFLOP/s"
  echo &"Theoretical peak multi: {CpuGhz * CpuFlopCycle * NumCpuCores:>9.3f} GFLOP/s"
  echo "Make sure to not bench Apple Accelerate or the default Linux BLAS."

  block:
    # Identical random inputs for every implementation.
    let a = newSeqWith(M*K, float32 rand(1.0))
    let b = newSeqWith(K*N, float32 rand(1.0))

    # when not defined(openmp):
    #   benchSimpleTiling(a, b, NbSamples) # for some reason stalled with OpenMP
    # benchArraymancerFallback(a, b, NbSamples)
    benchOpenBLAS(a, b, NbSamples)
    benchLaserGEMM(a, b, NbSamples)
    benchPyTorchGlow(a, b, NbSamples)
    # Seems like my original Arraymancer BLAS has false sharing issue

# FYI Apple accelerate is about 117~122GFLOP/s on my machine.
###############################
# Compilation command
# $ nim cpp -r -d:release -d:openmp -o:build/bench_gemm benchmarks/gemm/gemm_bench_float32.nim
# Don't forget to add OpenBLAS in your path:
# For example on Mac with OpenBLAS from Homebrew
# `export LD_LIBRARY_PATH=/usr/local/opt/openblas/lib`
###############################
# OpenMP
# $ ./build/bench_gemm
# Warmup: 1.1900 s, result 224 (displayed to avoid compiler optimizing warmup away)
# A matrix shape: (M: 1920, N: 1920)
# B matrix shape: (M: 1920, N: 1920)
# Output shape: (M: 1920, N: 1920)
# Required number of operations: 14155.776 millions
# Required bytes: 29.491 MB
# Arithmetic intensity: 480.000 FLOP/byte
# Theoretical peak single-core: 86.400 GFLOP/s
# Theoretical peak multi: 172.800 GFLOP/s
# Make sure to not bench Apple Accelerate or the default Linux BLAS.
# OpenBLAS benchmark
# Collected 10 samples in 0.977 seconds
# Average time: 97.464 ms
# Stddev time: 1.479 ms
# Min time: 95.994 ms
# Max time: 101.073 ms
# Perf: 145.241 GFLOP/s
# Display output[0] to make sure it's not optimized away
# 470.7781372070312
# Laser production implementation
# Collected 10 samples in 1.039 seconds
# Average time: 103.659 ms
# Stddev time: 4.858 ms
# Min time: 97.393 ms
# Max time: 114.670 ms
# Perf: 136.561 GFLOP/s
# Display output[0] to make sure it's not optimized away
# 470.778076171875
###############################
# Serial - Nim code compiled without -d:openmp
# $ OPENBLAS_NUM_THREADS=1 ./build/bench_gemm
# Warmup: 1.1973 s, result 224 (displayed to avoid compiler optimizing warmup away)
# A matrix shape: (M: 1920, N: 1920)
# B matrix shape: (M: 1920, N: 1920)
# Output shape: (M: 1920, N: 1920)
# Required number of operations: 14155.776 millions
# Required bytes: 29.491 MB
# Arithmetic intensity: 480.000 FLOP/byte
# Theoretical peak single-core: 86.400 GFLOP/s
# Theoretical peak multi: 172.800 GFLOP/s
# Make sure to not bench Apple Accelerate or the default Linux BLAS.
# OpenBLAS benchmark
# Collected 10 samples in 1.893 seconds
# Average time: 189.041 ms
# Stddev time: 2.558 ms
# Min time: 186.120 ms
# Max time: 193.507 ms
# Perf: 74.882 GFLOP/s
# Display output[0] to make sure it's not optimized away
# 470.7781372070312
# Laser production implementation
# Collected 10 samples in 1.942 seconds
# Average time: 193.975 ms
# Stddev time: 4.279 ms
# Min time: 190.571 ms
# Max time: 205.327 ms
# Perf: 72.978 GFLOP/s
# Display output[0] to make sure it's not optimized away
# 470.778076171875
# PyTorch Glow: libjit matmul implementation
# Collected 10 samples in 2.216 seconds
# Average time: 221.567 ms
# Stddev time: 2.270 ms
# Min time: 218.499 ms
# Max time: 225.679 ms
# Perf: 63.889 GFLOP/s
# Display output[0] to make sure it's not optimized away
# 470.778076171875