In [1]:
using Waveforms
using DifferentialEquations
using StaticArrays
using LinearAlgebra
using BenchmarkTools
using Distributions

┌ Info: Precompiling Waveforms [cb13b1c6-351e-5134-b3ad-d6a530956a82]
└ @ Base loading.jl:1278
┌ Info: Precompiling DifferentialEquations [0c46a032-eb83-5123-abaf-570d42b7fbaa]
└ @ Base loading.jl:1278
┌ Info: Precompiling BenchmarkTools [6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf]
└ @ Base loading.jl:1278


In [2]:
BLAS.vendor()

:mkl

In [3]:
const get_num_threads = function() # anonymous so it will be serialized when called
    blas = LinearAlgebra.BLAS.vendor()
    # Wrap in a try to catch unsupported blas versions
    try
        if blas == :openblas
            return ccall((:openblas_get_num_threads, Base.libblas_name), Cint, ())
        elseif blas == :openblas64
            return ccall((:openblas_get_num_threads64_, Base.libblas_name), Cint, ())
        elseif blas == :mkl
            return ccall((:MKL_Get_Max_Num_Threads, Base.libblas_name), Cint, ())
        end

        # OSX BLAS looks at an environment variable
        if Sys.isapple()
            return tryparse(Cint, get(ENV, "VECLIB_MAXIMUM_THREADS", "1"))
        end
    catch
    end

    return nothing
end

#1 (generic function with 1 method)

In [4]:
get_num_threads()

In [5]:
x = rand(5000,5000)
y = rand(5000,5000);

In [6]:
@benchmark z = x*y

BenchmarkTools.Trial: 
  memory estimate:  190.73 MiB
  allocs estimate:  2
  --------------
  minimum time:     466.005 ms (0.00% GC)
  median time:      516.769 ms (0.00% GC)
  mean time:        526.794 ms (3.32% GC)
  maximum time:     682.233 ms (14.36% GC)
  --------------
  samples:          10
  evals/sample:     1

In [7]:
z = zeros(5000,5000);

In [8]:
@benchmark mul!(z,x,y)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     441.218 ms (0.00% GC)
  median time:      487.216 ms (0.00% GC)
  mean time:        486.287 ms (0.00% GC)
  maximum time:     534.134 ms (0.00% GC)
  --------------
  samples:          11
  evals/sample:     1

## Static Arrays are faster but only recommended for use of arrays with less than 100 elements

In [9]:
x = @SMatrix rand(10,10)
y = @SMatrix rand(10,10);

In [10]:
@benchmark z = x*y

BenchmarkTools.Trial: 
  memory estimate:  816 bytes
  allocs estimate:  1
  --------------
  minimum time:     262.983 ns (0.00% GC)
  median time:      529.558 ns (0.00% GC)
  mean time:        572.633 ns (14.50% GC)
  maximum time:     52.927 μs (99.07% GC)
  --------------
  samples:          10000
  evals/sample:     362

In [11]:
x = rand(10,10)
y = rand(10,10);

In [12]:
@benchmark z = x*y

BenchmarkTools.Trial: 
  memory estimate:  896 bytes
  allocs estimate:  1
  --------------
  minimum time:     337.904 ns (0.00% GC)
  median time:      642.466 ns (0.00% GC)
  mean time:        677.353 ns (13.40% GC)
  maximum time:     89.700 μs (99.17% GC)
  --------------
  samples:          10000
  evals/sample:     219

In [13]:
x = Hermitian(rand(39,39) + I)
y = Hermitian(rand(39,39));

In [14]:
@benchmark z = x*y

BenchmarkTools.Trial: 
  memory estimate:  24.13 KiB
  allocs estimate:  2
  --------------
  minimum time:     17.100 μs (0.00% GC)
  median time:      25.401 μs (0.00% GC)
  mean time:        30.110 μs (5.69% GC)
  maximum time:     17.172 ms (99.69% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [15]:
x = Matrix(Hermitian(rand(39,39) + I))
y = Matrix(Hermitian(rand(39,39)));

In [16]:
@benchmark z = x*y

BenchmarkTools.Trial: 
  memory estimate:  12.06 KiB
  allocs estimate:  1
  --------------
  minimum time:     5.050 μs (0.00% GC)
  median time:      6.183 μs (0.00% GC)
  mean time:        7.396 μs (6.92% GC)
  maximum time:     2.910 ms (99.62% GC)
  --------------
  samples:          10000
  evals/sample:     6

In [17]:
x = rand(39,39)
y = rand(39,39);

In [18]:
@benchmark z = x*y

BenchmarkTools.Trial: 
  memory estimate:  12.06 KiB
  allocs estimate:  1
  --------------
  minimum time:     5.000 μs (0.00% GC)
  median time:      5.217 μs (0.00% GC)
  mean time:        5.960 μs (1.34% GC)
  maximum time:     806.333 μs (98.77% GC)
  --------------
  samples:          10000
  evals/sample:     6

### Checking some other aspects of the Lindblad RHS

In [19]:
x = rand(ComplexF64, 39,39)
y = rand(ComplexF64, 39,39);

In [20]:
@benchmark -1im * (x*y-y*x)

BenchmarkTools.Trial: 
  memory estimate:  95.56 KiB
  allocs estimate:  8
  --------------
  minimum time:     40.699 μs (0.00% GC)
  median time:      44.200 μs (0.00% GC)
  mean time:        53.716 μs (6.57% GC)
  maximum time:     14.998 ms (99.46% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [21]:
@benchmark -1im * (x*y-y*x) - 0.5*(y*x + x*y)

BenchmarkTools.Trial: 
  memory estimate:  215.02 KiB
  allocs estimate:  18
  --------------
  minimum time:     81.100 μs (0.00% GC)
  median time:      145.000 μs (0.00% GC)
  mean time:        154.582 μs (13.97% GC)
  maximum time:     29.071 ms (99.43% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [69]:
x = rand(ComplexF64,39,39)
y = rand(ComplexF64,36,39,39);

In [70]:
function multi_test(x::Array{Complex{Float64},2},y::Array{Complex{Float64},2})}
    -1im *(y*x-x*y) -0.5 * (y*x+x*y)
end

multi_test (generic function with 1 method)

In [71]:
@time multi_test(x,y);

MethodError: MethodError: no method matching multi_test(::Array{Complex{Float64},2}, ::Array{Complex{Float64},3})
Closest candidates are:
  multi_test(::Array{Complex{Float64},2}, !Matched::Array{Complex{Float64},2}) at In[70]:2

In [72]:
@benchmark multi_test(x,y)

MethodError: MethodError: no method matching multi_test(::Array{Complex{Float64},2}, ::Array{Complex{Float64},3})
Closest candidates are:
  multi_test(::Array{Complex{Float64},2}, !Matched::Array{Complex{Float64},2}) at In[70]:2

In [9]:
function sum_matrix(x,y)
    s = zeros(39,39)
    for i = 1:36
        s = s + y[i,:,:]*x*y[i,:,:]
    end
end

sum_matrix (generic function with 1 method)

In [10]:
@benchmark sum_matrix(x,y)

LoadError: DimensionMismatch("A has dimensions (5000,1) but B has dimensions (5000,5000)")

In [11]:
x = rand(Float64,39,39)
y = rand(Float64,36,39,39);

In [12]:
function sum_matrix(x,y)
    s = zeros(Float64,39,39)
    for i = 1:36
        s = s + y[i,:,:]*x*y[i,:,:]
    end
end

sum_matrix (generic function with 1 method)

In [13]:
@benchmark sum_matrix(x,y)

BenchmarkTools.Trial: 
  memory estimate:  2.13 MiB
  allocs estimate:  181
  --------------
  minimum time:     328.700 μs (0.00% GC)
  median time:      425.401 μs (0.00% GC)
  mean time:        872.616 μs (17.28% GC)
  maximum time:     24.193 ms (96.43% GC)
  --------------
  samples:          5732
  evals/sample:     1

In [14]:
function sum_matrix(x::Array{Float64,2},y::Array{Float64,3})
    s = zeros(Float64,39,39)
    for i = 1:36
        s = s + y[i,:,:]*x*y[i,:,:]
    end
end

sum_matrix (generic function with 2 methods)

In [15]:
@benchmark sum_matrix(x,y)

BenchmarkTools.Trial: 
  memory estimate:  2.13 MiB
  allocs estimate:  181
  --------------
  minimum time:     325.200 μs (0.00% GC)
  median time:      409.601 μs (0.00% GC)
  mean time:        487.763 μs (13.10% GC)
  maximum time:     22.171 ms (97.36% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [16]:
x = rand(ComplexF64,39,39)
y = rand(ComplexF64,36,39,39);

In [17]:
function sum_matrix(x::Array{Complex{Float64},2},y::Array{Complex{Float64},3})
    s = zeros(ComplexF64,39,39)
    for i = 1:36
        s = s + y[i,:,:]*x*y[i,:,:]
    end
end

sum_matrix (generic function with 3 methods)

In [18]:
@benchmark sum_matrix(x,y)

BenchmarkTools.Trial: 
  memory estimate:  4.22 MiB
  allocs estimate:  362
  --------------
  minimum time:     946.600 μs (0.00% GC)
  median time:      1.373 ms (0.00% GC)
  mean time:        2.344 ms (16.96% GC)
  maximum time:     38.358 ms (90.97% GC)
  --------------
  samples:          2131
  evals/sample:     1

In [19]:
function sum_matrix_inbounds(x::Array{Complex{Float64},2},y::Array{Complex{Float64},3})
    s = zeros(ComplexF64,39,39)
    @inbounds for i = 1:36
        s = s + (y[i,:,:]*x)*y[i,:,:]
    end
end

sum_matrix_inbounds (generic function with 1 method)

In [20]:
@benchmark sum_matrix_inbounds(x,y)

BenchmarkTools.Trial: 
  memory estimate:  4.22 MiB
  allocs estimate:  362
  --------------
  minimum time:     936.800 μs (0.00% GC)
  median time:      2.769 ms (0.00% GC)
  mean time:        3.069 ms (17.63% GC)
  maximum time:     38.225 ms (91.88% GC)
  --------------
  samples:          1628
  evals/sample:     1

In [21]:
function sum_matrix_inbounds_sim(x::Array{Complex{Float64},2},y::Array{Complex{Float64},3})
    s = zeros(ComplexF64,39,39)
    @inbounds @simd for i = 1:36
        s = s + (y[i,:,:]*x)*y[i,:,:]
    end
end

sum_matrix_inbounds_sim (generic function with 1 method)

In [22]:
@benchmark sum_matrix_inbounds_sim(x,y)

BenchmarkTools.Trial: 
  memory estimate:  4.22 MiB
  allocs estimate:  362
  --------------
  minimum time:     954.299 μs (0.00% GC)
  median time:      2.759 ms (0.00% GC)
  mean time:        3.010 ms (17.80% GC)
  maximum time:     41.167 ms (94.58% GC)
  --------------
  samples:          1661
  evals/sample:     1

In [23]:
function sum_matrix_threads(x::Array{Complex{Float64},2},y::Array{Complex{Float64},3})
    s = zeros(ComplexF64,39,39)
    Threads.@threads for i = 1:36
        s = s + (y[i,:,:]*x)*y[i,:,:]
    end
    s
end

sum_matrix_threads (generic function with 1 method)

In [24]:
sum_matrix_threads(x,y);

In [25]:
@benchmark sum_matrix_threads(x,y)

BenchmarkTools.Trial: 
  memory estimate:  4.22 MiB
  allocs estimate:  369
  --------------
  minimum time:     957.200 μs (0.00% GC)
  median time:      2.769 ms (0.00% GC)
  mean time:        3.048 ms (18.04% GC)
  maximum time:     51.171 ms (94.37% GC)
  --------------
  samples:          1637
  evals/sample:     1

In [26]:
@benchmark sum_matrix_threads(x,y)

BenchmarkTools.Trial: 
  memory estimate:  4.22 MiB
  allocs estimate:  369
  --------------
  minimum time:     957.100 μs (0.00% GC)
  median time:      2.792 ms (0.00% GC)
  mean time:        3.097 ms (17.28% GC)
  maximum time:     54.905 ms (94.61% GC)
  --------------
  samples:          1611
  evals/sample:     1

In [27]:
@time sum_matrix_threads(x,y);

  0.003789 seconds (371 allocations: 4.224 MiB)


In [28]:
@time sum_matrix_threads(x,y);

  0.003713 seconds (369 allocations: 4.224 MiB)


In [29]:
@time sum_matrix_inbounds_sim(x,y)

  0.002935 seconds (362 allocations: 4.223 MiB)


In [30]:
function sum_matrix_inbounds(x::Array{Complex{Float64},2},y::Array{Complex{Float64},3})
    s = zeros(ComplexF64,39,39)
    @inbounds for i = 1:36
        s .= s .+ y[i,:,:]*x*y[i,:,:]
    end
end

sum_matrix_inbounds (generic function with 1 method)

In [31]:
@benchmark sum_matrix_inbounds(x,y)

BenchmarkTools.Trial: 
  memory estimate:  3.38 MiB
  allocs estimate:  290
  --------------
  minimum time:     856.600 μs (0.00% GC)
  median time:      2.346 ms (0.00% GC)
  mean time:        2.575 ms (17.42% GC)
  maximum time:     55.057 ms (95.79% GC)
  --------------
  samples:          1941
  evals/sample:     1

In [32]:
function sum_matrix_inbounds(x::Array{Complex{Float64},2},y::Array{Complex{Float64},3})
    s = zeros(ComplexF64,39,39)
    @inbounds for i = 1:36
        a = @view y[i,:,:]
        b = @view y[i,:,:]
        s .= s .+ a*x*b
    end
end

sum_matrix_inbounds (generic function with 1 method)

In [33]:
@benchmark sum_matrix_inbounds(x,y)

BenchmarkTools.Trial: 
  memory estimate:  1.73 MiB
  allocs estimate:  578
  --------------
  minimum time:     5.454 ms (0.00% GC)
  median time:      6.314 ms (0.00% GC)
  mean time:        6.492 ms (4.28% GC)
  maximum time:     61.148 ms (89.33% GC)
  --------------
  samples:          771
  evals/sample:     1

In [34]:
function Lindblad_rhs(u::Array{Complex{Float64},2},t::Float64)
    s = zeros(ComplexF64,39,39)
    @inbounds for i = 1:33
        a = @view C_array_const[i,:,:]
        b = @view C_conj_array_const[i,:,:]
        s .= s .+ a*u*b
    end
    -1im .* (Hamiltonian(t).*u .- u*Hamiltonian(t)) .- 0.5 * (Cprecalc_const*u .+ u*Cprecalc_const) .+ s
end

Lindblad_rhs (generic function with 1 method)

In [35]:
const Cprecalc_const = rand(ComplexF64,39,39)
const C_array_const = rand(ComplexF64,36,39,39)
const C_conj_array_const = rand(ComplexF64,36,39,39);

In [36]:
function Hamiltonian(t::Float64)::Array{Complex{Float64},2}
    rand(ComplexF64,39,39)*t
end

Hamiltonian (generic function with 1 method)

In [37]:
u = rand(ComplexF64,39,39);

In [38]:
@benchmark Lindblad_rhs(u,1e-6)

BenchmarkTools.Trial: 
  memory estimate:  1.82 MiB
  allocs estimate:  550
  --------------
  minimum time:     5.169 ms (0.00% GC)
  median time:      6.007 ms (0.00% GC)
  mean time:        6.678 ms (3.98% GC)
  maximum time:     62.727 ms (90.35% GC)
  --------------
  samples:          749
  evals/sample:     1

In [39]:
function sum_mult!(result::Array{Complex{Float64},2}, 
                  A::Array{Complex{Float64},3},
                  B::Array{Complex{Float64},2}, 
                  C::Array{Complex{Float64},3})
    @inbounds for i = 1:33
        a = @view A[i,:,:]
        c = @view C[i,:,:]
        result .= result .+ a*B*c
    end
end

sum_mult! (generic function with 1 method)

In [40]:
s = zeros(ComplexF64,39,39);

In [41]:
@benchmark sum_mult!(s,C_array_const,u,C_conj_array_const)

BenchmarkTools.Trial: 
  memory estimate:  1.56 MiB
  allocs estimate:  528
  --------------
  minimum time:     5.014 ms (0.00% GC)
  median time:      5.698 ms (0.00% GC)
  mean time:        5.930 ms (4.73% GC)
  maximum time:     68.740 ms (91.48% GC)
  --------------
  samples:          844
  evals/sample:     1

In [42]:
function sum_mult!(result::Array{Complex{Float64},2}, 
                  A::Array{Complex{Float64},3},
                  B::Array{Complex{Float64},2}, 
                  C::Array{Complex{Float64},3})
    @inbounds for i = 1:33
        result .= result .+ A[i,:,:]*B*C[i,:,:]
    end
end

sum_mult! (generic function with 1 method)

In [43]:
s = zeros(ComplexF64,39,39);

In [44]:
@benchmark sum_mult!(s,C_array_const,u,C_conj_array_const)

BenchmarkTools.Trial: 
  memory estimate:  3.08 MiB
  allocs estimate:  264
  --------------
  minimum time:     845.000 μs (0.00% GC)
  median time:      2.147 ms (0.00% GC)
  mean time:        2.435 ms (19.04% GC)
  maximum time:     62.073 ms (96.64% GC)
  --------------
  samples:          2050
  evals/sample:     1

In [45]:
size(C_array_const)[1]

36

In [46]:
function sum_mult!(result::Array{Complex{Float64},2}, 
                  A::Array{Complex{Float64},3},
                  B::Array{Complex{Float64},2}, 
                  C::Array{Complex{Float64},3},
                  idx_max::Int64,
                  shape::Array{Int64,1}
    )
    intermediate1 = zeros(ComplexF64, shape[1], shape[2])
    intermediate2 = zeros(ComplexF64, shape[1], shape[2])
    @inbounds for i = 1:idx_max
        mul!(intermediate1,B,C[i,:,:])
        mul!(intermediate2, A[i,:,:], intermediate1)
        result .+= intermediate2
    end
end

sum_mult! (generic function with 2 methods)

In [47]:
s = zeros(ComplexF64,39,39);

In [48]:
@benchmark sum_mult!(s,C_array_const,u,C_conj_array_const, 33,[39,39])

BenchmarkTools.Trial: 
  memory estimate:  1.59 MiB
  allocs estimate:  137
  --------------
  minimum time:     663.299 μs (0.00% GC)
  median time:      1.479 ms (0.00% GC)
  mean time:        1.772 ms (16.73% GC)
  maximum time:     81.012 ms (97.85% GC)
  --------------
  samples:          2818
  evals/sample:     1

In [49]:
function Lindblad_rhs(u::Array{Complex{Float64},2},t::Float64)
    s = zeros(ComplexF64,39,39)
    sum_mult!(s,C_array_const,u,C_conj_array_const,33,[39,39])
    s .= s .- 1im .* Hamiltonian(t)*u
    s .= s .+ 1im .* Hamiltonian(t)*u
    s .= s .- 0.5 .* Cprecalc_const*u
    s .= s .- 0.5 .* u*Cprecalc_const
    return s
end

Lindblad_rhs (generic function with 1 method)

In [50]:
@benchmark Lindblad_rhs(u,1e-6)

BenchmarkTools.Trial: 
  memory estimate:  1.89 MiB
  allocs estimate:  163
  --------------
  minimum time:     768.000 μs (0.00% GC)
  median time:      1.697 ms (0.00% GC)
  mean time:        2.014 ms (16.32% GC)
  maximum time:     70.252 ms (96.75% GC)
  --------------
  samples:          2485
  evals/sample:     1

In [51]:
function Lindblad_rhs!(du::Array{Complex{Float64}}, u::Array{Complex{Float64},2},t::Float64)
    intermediate = zeros(ComplexF64,39,39)
    fill!(du,0.0)
    sum_mult!(du,C_array_const,u,C_conj_array_const,33,[39,39])
    mul!(intermediate, Hamiltonian(t), u)
    du .-= 1im .* intermediate
    mul!(intermediate, u, Hamiltonian(t))
    du .+= 1im .* intermediate
    mul!(intermediate, Cprecalc_const, u)
    du .-= 0.5 .* intermediate
    mul!(intermediate, u, Cprecalc_const)
    du .-= 0.5 .* intermediate
end

Lindblad_rhs! (generic function with 1 method)

In [52]:
du = rand(ComplexF64,39,39);

In [53]:
@benchmark Lindblad_rhs!(du, u,1e-6)

BenchmarkTools.Trial: 
  memory estimate:  1.70 MiB
  allocs estimate:  147
  --------------
  minimum time:     737.900 μs (0.00% GC)
  median time:      1.599 ms (0.00% GC)
  mean time:        1.853 ms (13.72% GC)
  maximum time:     73.833 ms (97.81% GC)
  --------------
  samples:          2694
  evals/sample:     1

In [54]:
@benchmark Lindblad_rhs!(du, u,1e-6)

BenchmarkTools.Trial: 
  memory estimate:  1.70 MiB
  allocs estimate:  147
  --------------
  minimum time:     726.100 μs (0.00% GC)
  median time:      1.538 ms (0.00% GC)
  mean time:        1.738 ms (16.15% GC)
  maximum time:     82.134 ms (98.05% GC)
  --------------
  samples:          2876
  evals/sample:     1

In [55]:
function square_wave(t::Float64, frequency::Float64, phase::Float64)
    0.5.*(1+squarewave(2*pi.*frequency.*t .+ phase))
end

square_wave (generic function with 1 method)

In [56]:
@benchmark square_wave(1e-6, 1/(2*2e-6), 0.0)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     3.099 ns (0.00% GC)
  median time:      3.100 ns (0.00% GC)
  mean time:        3.135 ns (0.00% GC)
  maximum time:     20.201 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1000

In [57]:
function Hamiltonian(t::Float64)::Array{Complex{Float64},2}
    rand(ComplexF64,39,39).*square_wave(t, 1/(2*2e-6), 0.0)
end

Hamiltonian (generic function with 1 method)

In [58]:
function Lindblad_rhs!(du::Array{Complex{Float64}}, u::Array{Complex{Float64},2},t::Float64)
    intermediate = zeros(ComplexF64,39,39)
    fill!(du,0.0)
    sum_mult!(du,C_array_const,u,C_conj_array_const,33,[39,39])
    mul!(intermediate, Hamiltonian(t), u)
    du .-= 1im .* intermediate
    mul!(intermediate, u, Hamiltonian(t))
    du .+= 1im .* intermediate
    mul!(intermediate, Cprecalc_const, u)
    du .-= 0.5 .* intermediate
    mul!(intermediate, u, Cprecalc_const)
    du .-= 0.5 .* intermediate
end

Lindblad_rhs! (generic function with 1 method)

In [59]:
@benchmark Lindblad_rhs!(du, u,1e-6)

BenchmarkTools.Trial: 
  memory estimate:  1.70 MiB
  allocs estimate:  147
  --------------
  minimum time:     732.600 μs (0.00% GC)
  median time:      1.542 ms (0.00% GC)
  mean time:        1.789 ms (17.05% GC)
  maximum time:     93.140 ms (98.32% GC)
  --------------
  samples:          2788
  evals/sample:     1

Still about a factor 1.5 to 2 slower than the numpy implementation.