forked from JuliaGPU/CUDA.jl
-
Notifications
You must be signed in to change notification settings - Fork 0
/
peakflops.jl
54 lines (42 loc) · 1.24 KB
/
peakflops.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
using CUDA
using Test
"Dummy kernel doing 100 FMAs."
function kernel_100fma(a, b, c, out)
    # Global 1-based linear index of this thread across the whole grid.
    idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    @inbounds x = a[idx]
    @inbounds y = b[idx]
    @inbounds z = c[idx]
    # 33 iterations × 3 dependent FMAs = 99; the final store below makes 100.
    # The chained data dependencies keep the compiler from collapsing the loop.
    for _ in 1:33
        x = CUDA.fma(x, y, z)
        y = CUDA.fma(x, y, z)
        z = CUDA.fma(x, y, z)
    end
    @inbounds out[idx] = CUDA.fma(x, y, z)
    return
end
"""
    peakflops(n::Integer=5000, dev::CuDevice=CuDevice(0))

Estimate the peak single-precision FLOP rate (in FLOPs/second) of device
`dev` by timing a kernel that performs 100 fused multiply-adds (200 flops)
per processed element of an `n`×`n` `Float32` matrix.
"""
function peakflops(n::Integer=5000, dev::CuDevice=CuDevice(0))
    device!(dev) do
        dims = (n, n)
        # Round to whole numbers so the FMA chain stays exactly representable.
        a = round.(rand(Float32, dims) * 100)
        b = round.(rand(Float32, dims) * 100)
        c = round.(rand(Float32, dims) * 100)

        d_a = CuArray(a)
        d_b = CuArray(b)
        d_c = CuArray(c)
        # Allocate the output directly on the device; uploading an
        # uninitialized host buffer (the old `CuArray(similar(a))`) wasted a
        # host-to-device copy of garbage data.
        d_out = similar(d_a)

        len = prod(dims)
        threads = min(len, 1024)
        blocks = len ÷ threads
        # `÷` floors, so when `len` is not a multiple of `threads` only
        # `blocks * threads` elements are actually touched by the kernel.
        # Count flops for those alone — the old `200*len` inflated the rate.
        nelems = blocks * threads

        # Warm-up launch so kernel compilation is excluded from the timing.
        @cuda kernel_100fma(d_a, d_b, d_c, d_out)
        synchronize()

        secs = CUDA.@elapsed begin
            @cuda blocks=blocks threads=threads kernel_100fma(d_a, d_b, d_c, d_out)
        end

        # 100 FMAs = 200 floating-point operations per processed element.
        flopcount = 200 * nelems
        return flopcount / secs
    end
end
println(peakflops())