/
weave_transposes.nim
299 lines (250 loc) · 10.7 KB
/
weave_transposes.nim
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# Original transposition codes from Laser project
# (c) Mamy André Ratsimbazafy, Apache License version 2
import
# Stdlib
strformat, os, strutils, math, system/ansi_c,
cpuinfo, streams, strscans,
# Third-party
cligen,
# Weave
../../weave
when not defined(windows):
# bench
import ../wtime, ../resources
# Memory
# ---------------------------------------------------
proc wv_alloc*(T: typedesc): ptr T {.inline.}=
## Default allocator for the Picasso library
## This allocates memory to hold the type T
## and returns a pointer to it
##
## Can use Nim allocator to measure the overhead of its lock
## Memory is not zeroed
when defined(WV_useNimAlloc):
createSharedU(T)
else:
cast[ptr T](c_malloc(csize_t sizeof(T)))
proc wv_alloc*(T: typedesc, len: SomeInteger): ptr UncheckedArray[T] {.inline.} =
## Default allocator for the Picasso library.
## This allocates a contiguous chunk of memory
## to hold ``len`` elements of type T
## and returns a pointer to it.
##
## Can use Nim allocator to measure the overhead of its lock
## Memory is not zeroed
when defined(WV_useNimAlloc):
cast[type result](createSharedU(T, len))
else:
cast[type result](c_malloc(csize_t len*sizeof(T)))
proc wv_free*[T: ptr](p: T) {.inline.} =
when defined(WV_useNimAlloc):
freeShared(p)
else:
c_free(p)
# Transpose implementations
# ---------------------------------------------------
type TransposeStrategy = enum
Sequential
Naive
Nested
TiledNested
# Question: do we need __restrict to avoid the compiler generating
# defensive aliasing robust code?
proc sequentialTranspose(M, N: int, bufIn, bufOut: ptr UncheckedArray[float32]) =
for j in 0 ..< N:
for i in 0 ..< M:
bufOut[j*M+i] = bufIn[i*N+j]
proc weaveNaiveTranspose(M, N: int, bufIn, bufOut: ptr UncheckedArray[float32]) =
## Transpose a MxN matrix into a NxM matrix
# Write are more expensive than read so we keep i accesses linear for writes
parallelFor j in 0 ..< N:
captures: {M, N, bufIn, bufOut}
for i in 0 ..< M:
bufOut[j*M+i] = bufIn[i*N+j]
proc weaveNestedTranspose(M, N: int, bufIn, bufOut: ptr UncheckedArray[float32]) =
## Transpose a MxN matrix into a NxM matrix with nested for loops
parallelFor j in 0 ..< N:
captures: {M, N, bufIn, bufOut}
parallelFor i in 0 ..< M:
captures: {j, M, N, bufIn, bufOut}
bufOut[j*M+i] = bufIn[i*N+j]
proc weave2DTiledNestedTranspose(M, N: int, bufIn, bufOut: ptr UncheckedArray[float32]) =
## Transpose with 2D tiling and nested
const blck = 64 # const do not need to be captured
parallelForStrided j in 0 ..< N, stride = blck:
captures: {M, N, bufIn, bufOut}
parallelForStrided i in 0 ..< M, stride = blck:
captures: {j, M, N, bufIn, bufOut}
for jj in j ..< min(j+blck, N):
for ii in i ..< min(i+blck, M):
bufOut[jj*M+ii] = bufIn[ii*N+jj]
# Meta
# ---------------------------------------------------
func computeMeta(height, width: int): tuple[reqOps, reqBytes, bufSize: int] =
result.reqOps = height * width
result.reqBytes = sizeof(float32) * height * width
result.bufSize = height * width
func initialize(buffer: ptr UncheckedArray[float32], len: int) =
for i in 0 ..< len:
buffer[i] = i.float32
# Bench
# ---------------------------------------------------
template memUsage(maxRSS, runtimeRSS, pageFaults: untyped{ident}, body: untyped) =
var maxRSS, runtimeRSS, pageFaults: int32
block:
when not defined(windows):
var ru: Rusage
getrusage(RusageSelf, ru)
runtimeRSS = ru.ru_maxrss
pageFaults = ru.ru_minflt
body
when not defined(windows):
getrusage(RusageSelf, ru)
runtimeRSS = ru.ru_maxrss - runtimeRSS
pageFaults = ru.ru_minflt - pageFaults
maxRss = ru.ru_maxrss
proc report(
M, N: int, nthreads: int32, nrounds: int, reordered: bool,
transposeStrategy: TransposeStrategy, reqOps, reqBytes: int,
mxnTime: float64, mxnMaxRSS, mxnRuntimeRss, mxnPageFaults: int32,
nxmTime: float64, nxmMaxRSS, nxmRuntimeRss, nxmPageFaults: int32,
) =
let arithIntensity = reqOps.float / reqBytes.float
let mxnPerf = reqOps.float/(mxnTime*1e-3 / nrounds.float) * 1e-9 # Gops per second
let nxmPerf = reqOps.float/(nxmTime*1e-3 / nrounds.float) * 1e-9 # Gops per second
const lazy = defined(WV_LazyFlowvar)
const config = if lazy: " (lazy flowvars)"
else: " (eager flowvars)"
echo "--------------------------------------------------------------------------"
echo "Scheduler: Weave ", config
echo "Benchmark: Transpose - ", $transposeStrategy
echo "Threads: ", nthreads
echo "# of rounds: ", nrounds
echo "# of operations: ", reqOps
echo "# of bytes: ", reqBytes
echo "Arithmetic Intensity: ", round(arithIntensity, 3)
echo "--------------------------------------------------------------------------"
if not reordered:
echo "Transposition: ", M,'x',N, " --> ", N, 'x', M
when not defined(windows):
echo "Time(ms): ", round(mxnTime, 3)
echo "Max RSS (KB): ", mxnMaxRss
echo "Runtime RSS (KB): ", mxnRuntimeRSS
echo "# of page faults: ", mxnPageFaults
echo "Perf (GMEMOPs/s ~ GigaMemory Operations/s) ", round(mxnPerf, 3)
echo "--------------------------------------------------------------------------"
echo "Transposition: ", N,'x',M, " --> ", M, 'x', N
when not defined(windows):
echo "Time(ms): ", round(nxmTime, 3)
echo "Max RSS (KB): ", nxmMaxRss
echo "Runtime RSS (KB): ", nxmRuntimeRSS
echo "# of page faults: ", nxmPageFaults
echo "Perf (GMEMOPs/s ~ GigaMemory Operations/s) ", round(nxmPerf, 3)
else:
echo "Transposition: ", N,'x',M, " --> ", M, 'x', N
when not defined(windows):
echo "Time(ms): ", round(nxmTime, 3)
echo "Max RSS (KB): ", nxmMaxRss
echo "Runtime RSS (KB): ", nxmRuntimeRSS
echo "# of page faults: ", nxmPageFaults
echo "Perf (GMEMOPs/s ~ GigaMemory Operations/s) ", round(mxnPerf, 3)
echo "--------------------------------------------------------------------------"
echo "Transposition: ", M,'x',N, " --> ", N, 'x', M
when not defined(windows):
echo "Time(ms): ", round(mxnTime, 3)
echo "Max RSS (KB): ", mxnMaxRss
echo "Runtime RSS (KB): ", mxnRuntimeRSS
echo "# of page faults: ", mxnPageFaults
echo "Perf (GMEMOPs/s ~ GigaMemory Operations/s) ", round(nxmPerf, 3)
template runBench(transposeName: typed, reorderCompute, isSequential: bool): untyped =
if not reorderCompute:
if not isSequential:
init(Weave)
memUsage(mxnMaxRss, mxnRuntimeRss, mxnPageFaults):
when not defined(windows):
let start = wtime_msec()
for _ in 0 ..< nrounds:
transposeName(M, N, bufIn, bufOut)
if not isSequential:
syncRoot(Weave)
when not defined(windows):
let stop = wtime_msec()
mxnTime = stop - start
memUsage(nxmMaxRss, nxmRuntimeRss, nxmPageFaults):
when not defined(windows):
let start = wtime_msec()
for _ in 0 ..< nrounds:
transposeName(N, M, bufIn, bufOut)
if not isSequential:
syncRoot(Weave)
when not defined(windows):
let stop = wtime_msec()
nxmTime = stop - start
if not isSequential:
exit(Weave)
report(M, N, nthreads, nrounds, reorderCompute,
transposeStrat, reqOps, reqBytes,
mxnTime, mxnMaxRSS, mxnRuntimeRss, mxnPageFaults,
nxmTime, nxmMaxRSS, nxmRuntimeRss, nxmPageFaults
)
else:
if not isSequential:
init(Weave)
memUsage(nxmMaxRss, nxmRuntimeRss, nxmPageFaults):
when not defined(windows):
let start = wtime_msec()
for _ in 0 ..< nrounds:
transposeName(N, M, bufIn, bufOut)
if not isSequential:
syncRoot(Weave)
when not defined(windows):
let stop = wtime_msec()
nxmTime = stop - start
memUsage(mxnMaxRss, mxnRuntimeRss, mxnPageFaults):
when not defined(windows):
let start = wtime_msec()
for _ in 0 ..< nrounds:
transposeName(M, N, bufIn, bufOut)
if not isSequential:
syncRoot(Weave)
when not defined(windows):
let stop = wtime_msec()
mxnTime = stop - start
if not isSequential:
exit(Weave)
report(M, N, nthreads, nrounds, reorderCompute,
transposeStrat, reqOps, reqBytes,
mxnTime, mxnMaxRSS, mxnRuntimeRss, mxnPageFaults,
nxmTime, nxmMaxRSS, nxmRuntimeRss, nxmPageFaults
)
# Interface
# ---------------------------------------------------
proc main(M = 400, N = 4000, nrounds = 1000, transposeStrat = TiledNested, reorderCompute=false) =
echo "Inverting the transpose order may favor one transposition heavily for non-tiled strategies"
let isSequential = transposeStrat == Sequential
var nthreads: int32
if transposeStrat == Sequential:
nthreads = 1
elif existsEnv"WEAVE_NUM_THREADS":
nthreads = getEnv"WEAVE_NUM_THREADS".parseInt().int32
else:
nthreads = countProcessors().int32
let (reqOps, reqBytes, bufSize) = computeMeta(M, N)
let bufOut = wv_alloc(float32, bufSize)
let bufIn = wv_alloc(float32, bufSize)
bufIn.initialize(bufSize)
var mxnTime, nxmTime: float64
case transposeStrat
of Sequential: runBench(sequentialTranspose, reorderCompute, isSequential)
of Naive: runBench(weaveNaiveTranspose, reorderCompute, isSequential)
of Nested: runBench(weaveNestedTranspose, reorderCompute, isSequential)
of TiledNested: runBench(weave2DTiledNestedTranspose, reorderCompute, isSequential)
wv_free(bufOut)
wv_free(bufIn)
dispatch(main)