In [1]:
using BenchmarkTools

In [2]:
using CUDA

In [3]:
using LoopVectorization

In [4]:
Threads.nthreads()

8

In [5]:
N = 4096

4096

In [6]:
A = rand(Float32, N,N);
B = rand(Float32, N,N);

In [7]:
A64 = rand(N,N);
B64 = rand(N,N);

In [8]:
cA = CuArray(A);
cB = CuArray(B);

In [9]:
function mymult(A:: AbstractArray{T,2}, B:: AbstractArray{T,2}) where {T}
    size(A, 2) == size(B, 1) || error("inner dims must match")
    C = zeros(T, size(A,1), size(B,2))
    for n=1:size(B, 2), k=1:size(A,2), m=1:size(A, 1)
        @inbounds C[m,n] += A[m,k]*B[k,n]
    end
    C
end

mymult (generic function with 1 method)

In [10]:
function mymult(A:: AbstractArray{T,2}, B:: AbstractArray{T,2}) where {T}
    size(A, 2) == size(B, 1) || error("inner dims must match")
    C = zeros(T, size(A,1), size(B,2))
    for n=1:size(B, 2), k=1:size(A,2), m=1:size(A, 1)
        @inbounds C[m,n] += A[m,k]*B[k,n]
    end
    C
end

mymult (generic function with 1 method)

In [11]:
function mymult_avx(A:: AbstractArray{T,2}, B:: AbstractArray{T,2}) where {T}
    size(A, 2) == size(B, 1) || error("inner dims must match")
    C = zeros(T, size(A,1), size(B,2))
    @avx for n=1:size(B, 2), k=1:size(A,2), m=1:size(A, 1)
        C[m,n] += A[m,k]*B[k,n]
    end
    C
end

mymult_avx (generic function with 1 method)

In [12]:
function mymult_treaded(A:: AbstractArray{T,2}, B:: AbstractArray{T,2}) where {T}
    size(A, 2) == size(B, 1) || error("inner dims must match")
    C = zeros(T, size(A,1), size(B,2))
    Threads.@threads for n=1:size(B, 2)
        for k=1:size(A,2), m=1:size(A, 1)
            @inbounds C[m,n] += A[m,k]*B[k,n]
        end
    end
    C
end

mymult_treaded (generic function with 1 method)

In [13]:
@btime $A*$B; # OpenBlas

  683.623 ms (2 allocations: 64.00 MiB)


In [14]:
@btime mymult($A,$B); # naive Julia implementation

  19.513 s (2 allocations: 64.00 MiB)


In [15]:
@btime mymult_avx($A,$B); # speed-up using LoopVectorization

  10.199 s (2 allocations: 64.00 MiB)


In [16]:
@btime mymult_treaded($A,$B); # naive Julia multi-threaded

  9.845 s (57 allocations: 64.01 MiB)


In [17]:
@btime $cA*$cB; # GPU

  2.450 μs (5 allocations: 288 bytes)


In [18]:
@btime $A64*$B64; # OpenBlas

  2.147 s (2 allocations: 128.00 MiB)
