In [1]:
using Pkg
Pkg.activate("/media/mat/HDD/AdaptiveTransportMap/")

[32m[1m Activating[22m[39m environment at `/media/mat/HDD/AdaptiveTransportMap/Project.toml`


In [2]:
using LinearAlgebra
using BenchmarkTools
using LoopVectorization
using Test

In [11]:
import Base: @propagate_inbounds

abstract type ParamFcn end

struct constant <:ParamFcn
end

struct linear <:ParamFcn
end

struct rbf <:ParamFcn
        μ::Float64
        σ::Float64
end

struct Basis{m}
    f::Array{ParamFcn,1}
    function Basis(f::Array{ParamFcn,1})
        return new{size(f,1)}(f)
    end
end


Base.size(B::Basis{m}) where {m} = m
@propagate_inbounds Base.getindex(B::Basis{m}, i::Int) where {m} = getindex(B.f,i)
@propagate_inbounds Base.setindex!(B::Basis{m}, v::ParamFcn, i::Int) where {m} = setindex!(B.f,v,i)

B = Basis([constant(); linear(); rbf(1.0, 1.0)])
# @code_warntype 
@code_warntype B[2]

Variables
  #self#[36m::Core.Compiler.Const(getindex, false)[39m
  B[36m::Basis{3}[39m
  i[36m::Int64[39m

Body[91m[1m::ParamFcn[22m[39m
[90m1 ─[39m      nothing
[90m│  [39m %2 = Base.getproperty(B, :f)[36m::Array{ParamFcn,1}[39m
[90m│  [39m %3 = Main.getindex(%2, i)[91m[1m::ParamFcn[22m[39m
[90m└──[39m      return %3


In [14]:
x = Real[1, 2.0, 3//1]
@code_warntype x[2] # Body::Real

Variables
  #self#[36m::Core.Compiler.Const(getindex, false)[39m
  A[36m::Array{Real,1}[39m
  i1[36m::Int64[39m

Body[91m[1m::Real[22m[39m
[90m1 ─[39m %1 = Base.arrayref($(Expr(:boundscheck)), A, i1)[91m[1m::Real[22m[39m
[90m└──[39m      return %1


Variables
  #self#[36m::Core.Compiler.Const(getindex, false)[39m
  B[36m::Basis{3}[39m
  i[36m::Int64[39m

Body[91m[1m::ParamFcn[22m[39m
[90m1 ─[39m      nothing
[90m│  [39m %2 = Base.getproperty(B, :f)[36m::Array{ParamFcn,1}[39m
[90m│  [39m %3 = Main.getindex(%2, i)[91m[1m::ParamFcn[22m[39m
[90m└──[39m      return %3


In [3]:
struct Element
    a::Float64
end


struct WrapElement
    e::Element
end
struct basis{m}
    A::Array{WrapElement,1}
end

Base.getindex(B::basis{m}, i::Int) where {m} = getindex(B.A,i)
Base.setindex!(B::basis{m}, v::element, i::Int) where {m} = setindex!(B.A,v,i)

UndefVarError: UndefVarError: element not defined

In [5]:
B = basis{10}(map(i->element(randn()),1:10))

basis{10}(element[element(0.19030348165349717), element(-1.6687101460989597), element(-0.4936488017928584), element(-1.1338973726510793), element(-0.9504540411413008), element(-1.7074663065573439), element(0.48446025066380866), element(-1.5339779949082857), element(-0.548060137109632), element(-1.9519771824055256)])

In [6]:
@code_warntype B[5]

Variables
  #self#[36m::Core.Compiler.Const(getindex, false)[39m
  B[36m::basis{10}[39m
  i[36m::Int64[39m

Body[36m::element[39m
[90m1 ─[39m %1 = Base.getproperty(B, :A)[36m::Array{element,1}[39m
[90m│  [39m %2 = Main.getindex(%1, i)[36m::element[39m
[90m└──[39m      return %2


In [5]:
function f1!(d, A, B, c)
    d .= (A .* B) * c
end

function f2!(E, d, A, B, c)
    nx, ny = size(A)
    @avx for j = 1:ny
        for i = 1:nx
            E[i, j] = A[i, j] * B[i, j]
        end
    end
    mul!(d, E, c)
end

function f3!(d, A, B, c)
    nx, ny = size(A)
    @avx for i = 1:nx
        di = zero(eltype(d))
        for j = 1:ny
            di += (A[i, j] * B[i, j]) * c[j]
        end
        d[i] = di
    end
end
function f4!(d, A, B, c)
    @avx @. d = (A * B) *ˡ c # note super script l; denotes lazy multiplication that fuses with broadcasts
end
nx = 500; ny = 20;
A = randn(nx, ny);
B = randn(nx, ny);
c = randn(ny);
d1 = zeros(nx); d2 = similar(d1); d3 = similar(d1); d4 = similar(d1);
E = zeros(nx, ny);
@btime f1!($d1, $A, $B, $c);
@btime f2!($E, $d2, $A, $B, $c);
@btime f3!($d3, $A, $B, $c);
@btime f4!($d4, $A, $B, $c);
@test  d1 ≈ d2 ≈ d3 ≈ d4

  13.273 μs (3 allocations: 82.27 KiB)
  7.930 μs (0 allocations: 0 bytes)
  1.425 μs (0 allocations: 0 bytes)
  1.459 μs (0 allocations: 0 bytes)


[32m[1mTest Passed[22m[39m

In [5]:
zero(eltype(d1))

0.0

In [21]:
function f1!(d, A, B, c)
    d .= (A .* B)*c
end

function f2!(E, d, A, B, c)
    nx, ny = size(A)
    for j=1:ny
        for i=1:nx
            E[i,j] = A[i,j]*B[i,j]
        end
    end
    d .= E*c
end

function f3!(d, A, B, c)
    nx, ny = size(A)
    Ei = zeros(ny)
    @inbounds for k=1:nx
        for i=1:nx
            for j=1:ny
            Ei[j] = A[i,j]*B[i,j]
            end
            d[k] = Ei'*c
        end
    end
end

function timing()
@btime  begin
    nx = 500
    ny = 20
    A = randn(nx, ny)
    B = randn(nx, ny)
    c = randn(ny);
end

@btime begin 
    nx = 500
    ny = 20
    A = randn(nx, ny)
    B = randn(nx, ny)
    c = randn(ny);
    d = zeros(nx)
    f1!(d, A, B, c)
end
    
@btime begin
    nx = 500
    ny = 20
    A = randn(nx, ny)
    B = randn(nx, ny)
    c = randn(ny);
    d = zeros(nx)
    E = zeros(nx,ny)
    f2!(E, d, A, B, c)
end  
    
@btime begin
    nx = 500
    ny = 20
    A = randn(nx, ny)
    B = randn(nx, ny)
    c = randn(ny);
    d = zeros(nx)
    f3!(d, A, B, c)
end 
end

timing()

  88.810 μs (5 allocations: 156.64 KiB)
  105.616 μs (9 allocations: 242.97 KiB)
  117.366 μs (9 allocations: 242.97 KiB)
  6.218 ms (7 allocations: 160.94 KiB)


In [17]:
    nx = 500
    ny = 20
    A = randn(nx, ny)
    B = randn(nx, ny)
    c = randn(ny);
    d = zeros(nx)
    f3!(d, A, B, c)

In [11]:
BLAS.gemm!

gemm! (generic function with 4 methods)

In [19]:
randn(10)'*randn(10)

5.880387343498137