In [1]:
using Pkg
Pkg.activate("/media/mat/HDD/AdaptiveTransportMap/")

[32m[1m Activating[22m[39m environment at `/media/mat/HDD/AdaptiveTransportMap/Project.toml`


In [4]:
using LinearAlgebra
using BenchmarkTools
using LoopVectorization
using Test

In [5]:
function f1!(d, A, B, c)
    d .= (A .* B) * c
end

function f2!(E, d, A, B, c)
    nx, ny = size(A)
    @avx for j = 1:ny
        for i = 1:nx
            E[i, j] = A[i, j] * B[i, j]
        end
    end
    mul!(d, E, c)
end

function f3!(d, A, B, c)
    nx, ny = size(A)
    @avx for i = 1:nx
        di = zero(eltype(d))
        for j = 1:ny
            di += (A[i, j] * B[i, j]) * c[j]
        end
        d[i] = di
    end
end
function f4!(d, A, B, c)
    @avx @. d = (A * B) *ˡ c # note super script l; denotes lazy multiplication that fuses with broadcasts
end
nx = 500; ny = 20;
A = randn(nx, ny);
B = randn(nx, ny);
c = randn(ny);
d1 = zeros(nx); d2 = similar(d1); d3 = similar(d1); d4 = similar(d1);
E = zeros(nx, ny);
@btime f1!($d1, $A, $B, $c);
@btime f2!($E, $d2, $A, $B, $c);
@btime f3!($d3, $A, $B, $c);
@btime f4!($d4, $A, $B, $c);
@test  d1 ≈ d2 ≈ d3 ≈ d4

  13.273 μs (3 allocations: 82.27 KiB)
  7.930 μs (0 allocations: 0 bytes)
  1.425 μs (0 allocations: 0 bytes)
  1.459 μs (0 allocations: 0 bytes)


[32m[1mTest Passed[22m[39m

In [5]:
zero(eltype(d1))

0.0

In [21]:
function f1!(d, A, B, c)
    d .= (A .* B)*c
end

function f2!(E, d, A, B, c)
    nx, ny = size(A)
    for j=1:ny
        for i=1:nx
            E[i,j] = A[i,j]*B[i,j]
        end
    end
    d .= E*c
end

function f3!(d, A, B, c)
    nx, ny = size(A)
    Ei = zeros(ny)
    @inbounds for k=1:nx
        for i=1:nx
            for j=1:ny
            Ei[j] = A[i,j]*B[i,j]
            end
            d[k] = Ei'*c
        end
    end
end

function timing()
@btime  begin
    nx = 500
    ny = 20
    A = randn(nx, ny)
    B = randn(nx, ny)
    c = randn(ny);
end

@btime begin 
    nx = 500
    ny = 20
    A = randn(nx, ny)
    B = randn(nx, ny)
    c = randn(ny);
    d = zeros(nx)
    f1!(d, A, B, c)
end
    
@btime begin
    nx = 500
    ny = 20
    A = randn(nx, ny)
    B = randn(nx, ny)
    c = randn(ny);
    d = zeros(nx)
    E = zeros(nx,ny)
    f2!(E, d, A, B, c)
end  
    
@btime begin
    nx = 500
    ny = 20
    A = randn(nx, ny)
    B = randn(nx, ny)
    c = randn(ny);
    d = zeros(nx)
    f3!(d, A, B, c)
end 
end

timing()

  88.810 μs (5 allocations: 156.64 KiB)
  105.616 μs (9 allocations: 242.97 KiB)
  117.366 μs (9 allocations: 242.97 KiB)
  6.218 ms (7 allocations: 160.94 KiB)


In [17]:
    nx = 500
    ny = 20
    A = randn(nx, ny)
    B = randn(nx, ny)
    c = randn(ny);
    d = zeros(nx)
    f3!(d, A, B, c)

In [11]:
BLAS.gemm!

gemm! (generic function with 4 methods)

In [19]:
randn(10)'*randn(10)

5.880387343498137