"""
peakflops_gpu_matmul_scaling(peakflops_func = peakflops_gpu_matmul; verbose=true) -> sizes, flops
Asserts the scaling of the given `peakflops_func`tion (defaults to [`peakflops_gpu_matmul`](@ref))
with increasing matrix size. If `verbose=true` (default), displays a unicode plot. Returns
the considered sizes and TFLOP/s. For further options, see [`peakflops_gpu_matmul`](@ref).
"""
function peakflops_gpu_matmul_scaling(
peakflops_func::F=peakflops_gpu_matmul;
device=CUDA.device(),
verbose=true,
sizes=2 .^ (10:15),
io::IO=stdout,
kwargs...,
) where {F}
flops = zeros(length(sizes))
for (i, s) in enumerate(sizes)
flops[i] = peakflops_func(; device=device, size=s, verbose=false, kwargs...)
clear_gpu_memory(device)
end
if verbose
peak_val, idx = findmax(flops)
peak_size = sizes[idx]
p = UnicodePlots.lineplot(
sizes,
flops;
xlabel="matrix size",
ylabel="TFLOP/s",
title=string(
"Peak: ", round(peak_val; digits=2), " TFLOP/s (size = $(peak_size))"
),
xscale=:log2,
)
UnicodePlots.lineplot!(p, [peak_size, peak_size], [0.0, peak_val]; color=:red)
println(io) # top margin
show(io, "text/plain", p)
        println(io) # bottom margin
end
return sizes, flops
end
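
# Usage sketch (hypothetical session; assumes a functional CUDA device):
#
#     sizes, flops = peakflops_gpu_matmul_scaling(; sizes=2 .^ (10:13))
#     # flops[i] is the TFLOP/s estimate for matrices of size sizes[i]; the red
#     # vertical line in the plot marks the size with the highest measurement.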

# FLOP count of one n×n matrix-matrix multiplication: n^3 multiply-add pairs,
# i.e. 2n^3 floating-point operations by the usual convention.
_flopcount_per_matmul(n) = 2 * Float64(n)^3
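
# Sanity check of the unit conversion in the functions below: for n = 2^14, one
# matmul costs 2 * (2^14)^3 ≈ 8.8e12 FLOPs, so completing it in one second
# corresponds to ≈ 8.8 TFLOP/s.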
"""
peakflops_gpu_matmul(; device, dtype=Float32, size=2^14, nmatmuls=5, nbench=5, verbose=true)
Tries to estimate the peak performance of a GPU in TFLOP/s by measuring the time
it takes to perform `nmatmuls` many (in-place) matrix-matrix multiplications.
**Keyword arguments:**
* `device` (default: `CUDA.device()`): CUDA device to be used.
* `dtype` (default: `Float32`): element type of the matrices.
* `size` (default: `2^14`): matrices will have dimensions `(size, size)`.
* `nmatmuls` (default: `5`): number of matmuls that will make up the kernel to be timed.
* `nbench` (default: `5`): number of measurements to be performed the best of which is used for the TFLOP/s computation.
* `verbose` (default: `true`): toggle printing.
* `io` (default: `stdout`): set the stream where the results should be printed.
See also: [`peakflops_gpu_matmul_scaling`](@ref), [`peakflops_gpu_matmul_graphs`](@ref).
"""
function peakflops_gpu_matmul(;
device=CUDA.device(),
dtype=Float32,
size=2^14,
nmatmuls=5,
nbench=5,
verbose=true,
io::IO=stdout,
)
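    # Run the benchmark with `device` as the active CUDA device.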
device!(device) do
C = CUDA.zeros(dtype, size, size)
A = CUDA.rand(dtype, size, size)
B = CUDA.rand(dtype, size, size)
        CUDA.@elapsed mul!(C, A, B) # warmup: trigger compilation and cuBLAS initialization
t = Inf
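        # NVTX ranges label these regions on the profiler timeline (e.g. in
        # NVIDIA Nsight Systems); without a profiler attached they are essentially free.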
NVTX.@range "peakflops_gpu: bench loop" begin
for i in 1:nbench
NVTX.@range "peakflops_gpu: kernel" begin
Δt = CUDA.@elapsed for _ in 1:nmatmuls
mul!(C, A, B)
# cublasGemmEx_wrapper!('N','N',A,B,C)
end
end
t = min(t, Δt)
end
end
flops = (_flopcount_per_matmul(size) * nmatmuls * 1e-12) / t
if verbose
printstyled(io, "Peakflops (TFLOP/s):\n"; bold=true)
print(io, " └ max: ")
printstyled(io, round(flops; digits=2), "\n"; color=:green, bold=true)
end
return flops
end
end
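
# Usage sketch (hypothetical output; the value depends on the GPU and on `dtype`):
#
#     peakflops_gpu_matmul(; dtype=Float32, size=2^14)
#     # Peakflops (TFLOP/s):
#     #  └ max: ...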
"""
Same as [`peakflops_gpu_matmul`](@ref) but uses CUDA's graph API to define and launch the kernel.
See also: [`peakflops_gpu_matmul_scaling`](@ref).
"""
function peakflops_gpu_matmul_graphs(;
device=CUDA.device(),
dtype=Float32,
size=2^14,
nmatmuls=5,
nbench=5,
verbose=true,
io::IO=stdout,
)
device!(device) do
C = CUDA.zeros(dtype, size, size)
A = CUDA.rand(dtype, size, size)
B = CUDA.rand(dtype, size, size)
t = Inf
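        # `CUDA.capture` records the operations issued in the do-block into a
        # CUDA graph without executing them; `instantiate` turns the graph into
        # an executable that `CUDA.launch` then replays in a single launch.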
for i in 1:nbench
graph = CUDA.capture() do
for _ in 1:nmatmuls
mul!(C, A, B)
# cublasGemmEx_wrapper!('N','N',A,B,C)
end
end
exec = instantiate(graph)
Δt = CUDA.@elapsed CUDA.launch(exec)
t = min(t, Δt)
end
flops = (_flopcount_per_matmul(size) * nmatmuls * 1e-12) / t
if verbose
printstyled(io, "Peakflops (TFLOP/s):\n"; bold=true)
print(io, " └ max: ")
printstyled(io, round(flops; digits=2), "\n"; color=:green, bold=true)
end
return flops
end
end
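
# Usage sketch (hypothetical; the graph-based variant mainly pays off when kernel
# launch overhead is non-negligible, i.e. for smaller matrices):
#
#     peakflops_gpu_matmul_graphs(; size=2^12, nmatmuls=100)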