In [1]:
using Waveforms
using DifferentialEquations
using StaticArrays
using LinearAlgebra
using BenchmarkTools
using Distributions

┌ Info: Precompiling Waveforms [cb13b1c6-351e-5134-b3ad-d6a530956a82]
└ @ Base loading.jl:1260
┌ Info: Precompiling DifferentialEquations [0c46a032-eb83-5123-abaf-570d42b7fbaa]
└ @ Base loading.jl:1260
┌ Info: Precompiling BenchmarkTools [6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf]
└ @ Base loading.jl:1260


In [2]:
BLAS.vendor()

:mkl

In [3]:
const get_num_threads = function() # anonymous so it will be serialized when called
    blas = LinearAlgebra.BLAS.vendor()
    # Wrap in a try to catch unsupported blas versions
    try
        if blas == :openblas
            return ccall((:openblas_get_num_threads, Base.libblas_name), Cint, ())
        elseif blas == :openblas64
            return ccall((:openblas_get_num_threads64_, Base.libblas_name), Cint, ())
        elseif blas == :mkl
            return ccall((:MKL_Get_Max_Num_Threads, Base.libblas_name), Cint, ())
        end

        # OSX BLAS looks at an environment variable
        if Sys.isapple()
            return tryparse(Cint, get(ENV, "VECLIB_MAXIMUM_THREADS", "1"))
        end
    catch
    end

    return nothing
end

#3 (generic function with 1 method)

In [4]:
get_num_threads()

In [5]:
x = rand(5000,5000)
y = rand(5000,5000);

In [6]:
@benchmark z = x*y

BenchmarkTools.Trial: 
  memory estimate:  190.73 MiB
  allocs estimate:  2
  --------------
  minimum time:     786.948 ms (0.00% GC)
  median time:      1.141 s (0.00% GC)
  mean time:        1.050 s (0.64% GC)
  maximum time:     1.253 s (2.66% GC)
  --------------
  samples:          5
  evals/sample:     1

In [7]:
z = zeros(5000,5000);

In [8]:
@benchmark mul!(z,x,y)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     782.181 ms (0.00% GC)
  median time:      873.955 ms (0.00% GC)
  mean time:        855.442 ms (0.00% GC)
  maximum time:     882.185 ms (0.00% GC)
  --------------
  samples:          6
  evals/sample:     1

## Static Arrays are faster but only recommended for use of arrays with less than 100 elements

In [9]:
x = @SMatrix rand(10,10)
y = @SMatrix rand(10,10);

In [10]:
@benchmark z = x*y

BenchmarkTools.Trial: 
  memory estimate:  816 bytes
  allocs estimate:  1
  --------------
  minimum time:     257.402 ns (0.00% GC)
  median time:      521.752 ns (0.00% GC)
  mean time:        549.451 ns (15.97% GC)
  maximum time:     55.322 μs (98.85% GC)
  --------------
  samples:          10000
  evals/sample:     331

In [11]:
x = rand(10,10)
y = rand(10,10);

In [12]:
@benchmark z = x*y

BenchmarkTools.Trial: 
  memory estimate:  896 bytes
  allocs estimate:  1
  --------------
  minimum time:     339.265 ns (0.00% GC)
  median time:      636.986 ns (0.00% GC)
  mean time:        665.904 ns (13.68% GC)
  maximum time:     82.200 μs (99.09% GC)
  --------------
  samples:          10000
  evals/sample:     219

In [13]:
x = Hermitian(rand(39,39) + I)
y = Hermitian(rand(39,39));

In [14]:
@benchmark z = x*y

BenchmarkTools.Trial: 
  memory estimate:  24.13 KiB
  allocs estimate:  2
  --------------
  minimum time:     6.700 μs (0.00% GC)
  median time:      15.650 μs (0.00% GC)
  mean time:        18.478 μs (7.34% GC)
  maximum time:     4.788 ms (99.51% GC)
  --------------
  samples:          10000
  evals/sample:     4

In [15]:
x = Matrix(Hermitian(rand(39,39) + I))
y = Matrix(Hermitian(rand(39,39)));

In [16]:
@benchmark z = x*y

BenchmarkTools.Trial: 
  memory estimate:  12.06 KiB
  allocs estimate:  1
  --------------
  minimum time:     3.237 μs (0.00% GC)
  median time:      4.188 μs (0.00% GC)
  mean time:        5.720 μs (8.82% GC)
  maximum time:     2.397 ms (99.58% GC)
  --------------
  samples:          10000
  evals/sample:     8

In [17]:
x = rand(39,39)
y = rand(39,39);

In [18]:
@benchmark z = x*y

BenchmarkTools.Trial: 
  memory estimate:  12.06 KiB
  allocs estimate:  1
  --------------
  minimum time:     3.225 μs (0.00% GC)
  median time:      4.000 μs (0.00% GC)
  mean time:        5.002 μs (5.57% GC)
  maximum time:     739.300 μs (98.70% GC)
  --------------
  samples:          10000
  evals/sample:     8

### Checking some other aspects of the Lindblad RHS

In [19]:
x = rand(ComplexF64, 39,39)
y = rand(ComplexF64, 39,39);

In [20]:
@benchmark -1im * (x*y-y*x)

BenchmarkTools.Trial: 
  memory estimate:  95.56 KiB
  allocs estimate:  8
  --------------
  minimum time:     13.200 μs (0.00% GC)
  median time:      40.400 μs (0.00% GC)
  mean time:        48.861 μs (24.42% GC)
  maximum time:     24.981 ms (99.60% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [21]:
@benchmark -1im * (x*y-y*x) - 0.5*(y*x + x*y)

BenchmarkTools.Trial: 
  memory estimate:  215.02 KiB
  allocs estimate:  18
  --------------
  minimum time:     28.700 μs (0.00% GC)
  median time:      95.200 μs (0.00% GC)
  mean time:        132.112 μs (22.43% GC)
  maximum time:     38.482 ms (99.61% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [22]:
x = rand(ComplexF64,39,39)
y = rand(ComplexF64,39,39);

In [78]:
function multi_test(x::Array{Complex{Float64},2},y::Array{Complex{Float64},2})}
    -1im *(y*x-x*y) -0.5 * (y*x+x*y)
end

LoadError: syntax: unexpected "}"

In [24]:
@time multi_test(x,y);

UndefVarError: UndefVarError: multi_test not defined

In [25]:
@benchmark multi_test(x,y)

UndefVarError: UndefVarError: multi_test not defined

In [26]:
x = rand(ComplexF64,39,39)
y = rand(ComplexF64,36,39,39);

In [27]:
function sum_matrix(x,y)
    s = zeros(39,39)
    for i = 1:36
        s = s + y[i,:,:]*x*y[i,:,:]
    end
end

sum_matrix (generic function with 1 method)

In [28]:
@benchmark sum_matrix(x,y)

BenchmarkTools.Trial: 
  memory estimate:  4.21 MiB
  allocs estimate:  361
  --------------
  minimum time:     631.800 μs (0.00% GC)
  median time:      2.078 ms (0.00% GC)
  mean time:        2.489 ms (22.83% GC)
  maximum time:     44.272 ms (94.70% GC)
  --------------
  samples:          2003
  evals/sample:     1

In [29]:
x = rand(Float64,39,39)
y = rand(Float64,36,39,39);

In [30]:
function sum_matrix(x,y)
    s = zeros(Float64,39,39)
    for i = 1:36
        s = s + y[i,:,:]*x*y[i,:,:]
    end
end

sum_matrix (generic function with 1 method)

In [31]:
@benchmark sum_matrix(x,y)

BenchmarkTools.Trial: 
  memory estimate:  2.13 MiB
  allocs estimate:  181
  --------------
  minimum time:     407.900 μs (0.00% GC)
  median time:      509.700 μs (0.00% GC)
  mean time:        706.934 μs (12.75% GC)
  maximum time:     25.934 ms (96.34% GC)
  --------------
  samples:          7053
  evals/sample:     1

In [32]:
function sum_matrix(x::Array{Float64,2},y::Array{Float64,3})
    s = zeros(Float64,39,39)
    for i = 1:36
        s = s + y[i,:,:]*x*y[i,:,:]
    end
end

sum_matrix (generic function with 2 methods)

In [33]:
@benchmark sum_matrix(x,y)

BenchmarkTools.Trial: 
  memory estimate:  2.13 MiB
  allocs estimate:  181
  --------------
  minimum time:     428.401 μs (0.00% GC)
  median time:      523.250 μs (0.00% GC)
  mean time:        633.673 μs (10.80% GC)
  maximum time:     25.173 ms (96.17% GC)
  --------------
  samples:          7866
  evals/sample:     1

In [34]:
x = rand(ComplexF64,39,39)
y = rand(ComplexF64,36,39,39);

In [35]:
function sum_matrix(x::Array{Complex{Float64},2},y::Array{Complex{Float64},3})
    s = zeros(ComplexF64,39,39)
    for i = 1:36
        s = s + y[i,:,:]*x*y[i,:,:]
    end
end

sum_matrix (generic function with 3 methods)

In [36]:
@benchmark sum_matrix(x,y)

BenchmarkTools.Trial: 
  memory estimate:  4.22 MiB
  allocs estimate:  362
  --------------
  minimum time:     661.200 μs (0.00% GC)
  median time:      1.021 ms (0.00% GC)
  mean time:        1.572 ms (16.43% GC)
  maximum time:     44.661 ms (96.03% GC)
  --------------
  samples:          3173
  evals/sample:     1

In [37]:
function sum_matrix_inbounds(x::Array{Complex{Float64},2},y::Array{Complex{Float64},3})
    s = zeros(ComplexF64,39,39)
    @inbounds for i = 1:36
        s = s + (y[i,:,:]*x)*y[i,:,:]
    end
end

sum_matrix_inbounds (generic function with 1 method)

In [38]:
@benchmark sum_matrix_inbounds(x,y)

BenchmarkTools.Trial: 
  memory estimate:  4.22 MiB
  allocs estimate:  362
  --------------
  minimum time:     659.000 μs (0.00% GC)
  median time:      1.988 ms (0.00% GC)
  mean time:        2.214 ms (19.55% GC)
  maximum time:     40.473 ms (95.01% GC)
  --------------
  samples:          2253
  evals/sample:     1

In [39]:
function sum_matrix_inbounds_sim(x::Array{Complex{Float64},2},y::Array{Complex{Float64},3})
    s = zeros(ComplexF64,39,39)
    @inbounds @simd for i = 1:36
        s = s + (y[i,:,:]*x)*y[i,:,:]
    end
end

sum_matrix_inbounds_sim (generic function with 1 method)

In [40]:
@benchmark sum_matrix_inbounds_sim(x,y)

BenchmarkTools.Trial: 
  memory estimate:  4.22 MiB
  allocs estimate:  362
  --------------
  minimum time:     704.600 μs (0.00% GC)
  median time:      2.084 ms (0.00% GC)
  mean time:        2.508 ms (22.63% GC)
  maximum time:     56.761 ms (96.71% GC)
  --------------
  samples:          1988
  evals/sample:     1

In [41]:
function sum_matrix_threads(x::Array{Complex{Float64},2},y::Array{Complex{Float64},3})
    s = zeros(ComplexF64,39,39)
    Threads.@threads for i = 1:36
        s = s + (y[i,:,:]*x)*y[i,:,:]
    end
    s
end

sum_matrix_threads (generic function with 1 method)

In [42]:
sum_matrix_threads(x,y);

In [43]:
@benchmark sum_matrix_threads(x,y)

BenchmarkTools.Trial: 
  memory estimate:  4.23 MiB
  allocs estimate:  396
  --------------
  minimum time:     594.200 μs (0.00% GC)
  median time:      1.907 ms (0.00% GC)
  mean time:        36.671 ms (94.86% GC)
  maximum time:     12.039 s (99.98% GC)
  --------------
  samples:          346
  evals/sample:     1

In [44]:
@benchmark sum_matrix_threads(x,y)

BenchmarkTools.Trial: 
  memory estimate:  4.23 MiB
  allocs estimate:  396
  --------------
  minimum time:     569.400 μs (0.00% GC)
  median time:      1.934 ms (0.00% GC)
  mean time:        37.321 ms (94.95% GC)
  maximum time:     12.404 s (99.98% GC)
  --------------
  samples:          350
  evals/sample:     1

In [45]:
@time sum_matrix_threads(x,y);

  0.001147 seconds (397 allocations: 4.228 MiB)


In [46]:
@time sum_matrix_threads(x,y);

  0.001083 seconds (398 allocations: 4.228 MiB)


In [47]:
@time sum_matrix_inbounds_sim(x,y)

  0.002887 seconds (362 allocations: 4.223 MiB)


In [48]:
function sum_matrix_inbounds(x::Array{Complex{Float64},2},y::Array{Complex{Float64},3})
    s = zeros(ComplexF64,39,39)
    @inbounds for i = 1:36
        s .= s .+ y[i,:,:]*x*y[i,:,:]
    end
end

sum_matrix_inbounds (generic function with 1 method)

In [49]:
@benchmark sum_matrix_inbounds(x,y)

BenchmarkTools.Trial: 
  memory estimate:  3.38 MiB
  allocs estimate:  290
  --------------
  minimum time:     629.500 μs (0.00% GC)
  median time:      1.719 ms (0.00% GC)
  mean time:        1.957 ms (15.74% GC)
  maximum time:     46.789 ms (95.90% GC)
  --------------
  samples:          2546
  evals/sample:     1

In [50]:
function sum_matrix_inbounds(x::Array{Complex{Float64},2},y::Array{Complex{Float64},3})
    s = zeros(ComplexF64,39,39)
    @inbounds for i = 1:36
        a = @view y[i,:,:]
        b = @view y[i,:,:]
        s .= s .+ a*x*b
    end
end

sum_matrix_inbounds (generic function with 1 method)

In [51]:
@benchmark sum_matrix_inbounds(x,y)

BenchmarkTools.Trial: 
  memory estimate:  1.73 MiB
  allocs estimate:  650
  --------------
  minimum time:     4.221 ms (0.00% GC)
  median time:      4.806 ms (0.00% GC)
  mean time:        4.777 ms (2.57% GC)
  maximum time:     32.743 ms (84.65% GC)
  --------------
  samples:          1046
  evals/sample:     1

In [52]:
function Lindblad_rhs(u::Array{Complex{Float64},2},t::Float64)
    s = zeros(ComplexF64,39,39)
    @inbounds for i = 1:33
        a = @view C_array_const[i,:,:]
        b = @view C_conj_array_const[i,:,:]
        s .= s .+ a*u*b
    end
    -1im .* (Hamiltonian(t).*u .- u*Hamiltonian(t)) .- 0.5 * (Cprecalc_const*u .+ u*Cprecalc_const) .+ s
end

Lindblad_rhs (generic function with 1 method)

In [53]:
const Cprecalc_const = rand(ComplexF64,39,39)
const C_array_const = rand(ComplexF64,36,39,39)
const C_conj_array_const = rand(ComplexF64,36,39,39);

In [54]:
function Hamiltonian(t::Float64)::Array{Complex{Float64},2}
    rand(ComplexF64,39,39)*t
end

Hamiltonian (generic function with 1 method)

In [55]:
u = rand(ComplexF64,39,39);

In [56]:
@benchmark Lindblad_rhs(u,1e-6)

BenchmarkTools.Trial: 
  memory estimate:  1.82 MiB
  allocs estimate:  616
  --------------
  minimum time:     3.997 ms (0.00% GC)
  median time:      4.893 ms (0.00% GC)
  mean time:        6.365 ms (2.91% GC)
  maximum time:     51.295 ms (83.89% GC)
  --------------
  samples:          785
  evals/sample:     1

In [57]:
function sum_mult!(result::Array{Complex{Float64},2}, 
                  A::Array{Complex{Float64},3},
                  B::Array{Complex{Float64},2}, 
                  C::Array{Complex{Float64},3})
    @inbounds for i = 1:33
        a = @view A[i,:,:]
        c = @view C[i,:,:]
        result .= result .+ a*B*c
    end
end

sum_mult! (generic function with 1 method)

In [58]:
s = zeros(ComplexF64,39,39);

In [59]:
@benchmark sum_mult!(s,C_array_const,u,C_conj_array_const)

BenchmarkTools.Trial: 
  memory estimate:  1.57 MiB
  allocs estimate:  594
  --------------
  minimum time:     3.890 ms (0.00% GC)
  median time:      4.484 ms (0.00% GC)
  mean time:        4.506 ms (3.01% GC)
  maximum time:     39.643 ms (88.73% GC)
  --------------
  samples:          1109
  evals/sample:     1

In [60]:
function sum_mult!(result::Array{Complex{Float64},2}, 
                  A::Array{Complex{Float64},3},
                  B::Array{Complex{Float64},2}, 
                  C::Array{Complex{Float64},3})
    @inbounds for i = 1:33
        result .= result .+ A[i,:,:]*B*C[i,:,:]
    end
end

sum_mult! (generic function with 1 method)

In [61]:
s = zeros(ComplexF64,39,39);

In [62]:
@benchmark sum_mult!(s,C_array_const,u,C_conj_array_const)

BenchmarkTools.Trial: 
  memory estimate:  3.08 MiB
  allocs estimate:  264
  --------------
  minimum time:     644.000 μs (0.00% GC)
  median time:      1.585 ms (0.00% GC)
  mean time:        1.777 ms (16.75% GC)
  maximum time:     52.727 ms (95.74% GC)
  --------------
  samples:          2805
  evals/sample:     1

In [63]:
size(C_array_const)[1]

36

In [64]:
function sum_mult!(result::Array{Complex{Float64},2}, 
                  A::Array{Complex{Float64},3},
                  B::Array{Complex{Float64},2}, 
                  C::Array{Complex{Float64},3},
                  idx_max::Int64,
                  shape::Array{Int64,1}
    )
    intermediate1 = zeros(ComplexF64, shape[1], shape[2])
    intermediate2 = zeros(ComplexF64, shape[1], shape[2])
    @inbounds for i = 1:idx_max
        mul!(intermediate1,B,C[i,:,:])
        mul!(intermediate2, A[i,:,:], intermediate1)
        result .+= intermediate2
    end
end

sum_mult! (generic function with 2 methods)

In [65]:
s = zeros(ComplexF64,39,39);

In [66]:
@benchmark sum_mult!(s,C_array_const,u,C_conj_array_const, 33,[39,39])

BenchmarkTools.Trial: 
  memory estimate:  1.59 MiB
  allocs estimate:  137
  --------------
  minimum time:     570.700 μs (0.00% GC)
  median time:      1.189 ms (0.00% GC)
  mean time:        1.312 ms (11.01% GC)
  maximum time:     60.194 ms (97.29% GC)
  --------------
  samples:          3801
  evals/sample:     1

In [67]:
function Lindblad_rhs(u::Array{Complex{Float64},2},t::Float64)
    s = zeros(ComplexF64,39,39)
    sum_mult!(s,C_array_const,u,C_conj_array_const,33,[39,39])
    s .= s .- 1im .* Hamiltonian(t)*u
    s .= s .+ 1im .* Hamiltonian(t)*u
    s .= s .- 0.5 .* Cprecalc_const*u
    s .= s .- 0.5 .* u*Cprecalc_const
    return s
end

Lindblad_rhs (generic function with 1 method)

In [68]:
@benchmark Lindblad_rhs(u,1e-6)

BenchmarkTools.Trial: 
  memory estimate:  1.89 MiB
  allocs estimate:  163
  --------------
  minimum time:     664.400 μs (0.00% GC)
  median time:      1.370 ms (0.00% GC)
  mean time:        1.564 ms (12.59% GC)
  maximum time:     80.659 ms (97.91% GC)
  --------------
  samples:          3188
  evals/sample:     1

In [69]:
function Lindblad_rhs!(du::Array{Complex{Float64}}, u::Array{Complex{Float64},2},t::Float64)
    intermediate = zeros(ComplexF64,39,39)
    fill!(du,0.0)
    sum_mult!(du,C_array_const,u,C_conj_array_const,33,[39,39])
    mul!(intermediate, Hamiltonian(t), u)
    du .-= 1im .* intermediate
    mul!(intermediate, u, Hamiltonian(t))
    du .+= 1im .* intermediate
    mul!(intermediate, Cprecalc_const, u)
    du .-= 0.5 .* intermediate
    mul!(intermediate, u, Cprecalc_const)
    du .-= 0.5 .* intermediate
end

Lindblad_rhs! (generic function with 1 method)

In [70]:
du = rand(ComplexF64,39,39);

In [71]:
@benchmark Lindblad_rhs!(du, u,1e-6)

BenchmarkTools.Trial: 
  memory estimate:  1.70 MiB
  allocs estimate:  147
  --------------
  minimum time:     678.400 μs (0.00% GC)
  median time:      1.327 ms (0.00% GC)
  mean time:        1.510 ms (11.07% GC)
  maximum time:     63.619 ms (96.65% GC)
  --------------
  samples:          3297
  evals/sample:     1

In [72]:
@benchmark Lindblad_rhs!(du, u,1e-6)

BenchmarkTools.Trial: 
  memory estimate:  1.70 MiB
  allocs estimate:  147
  --------------
  minimum time:     644.700 μs (0.00% GC)
  median time:      1.327 ms (0.00% GC)
  mean time:        1.506 ms (9.97% GC)
  maximum time:     50.044 ms (95.96% GC)
  --------------
  samples:          3307
  evals/sample:     1

In [73]:
function square_wave(t::Float64, frequency::Float64, phase::Float64)
    0.5.*(1+squarewave(2*pi.*frequency.*t .+ phase))
end

square_wave (generic function with 1 method)

In [74]:
@benchmark square_wave(1e-6, 1/(2*2e-6), 0.0)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     2.799 ns (0.00% GC)
  median time:      2.801 ns (0.00% GC)
  mean time:        2.853 ns (0.00% GC)
  maximum time:     7.700 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1000

In [75]:
function Hamiltonian(t::Float64)::Array{Complex{Float64},2}
    rand(ComplexF64,39,39).*square_wave(t, 1/(2*2e-6), 0.0)
end

Hamiltonian (generic function with 1 method)

In [76]:
function Lindblad_rhs!(du::Array{Complex{Float64}}, u::Array{Complex{Float64},2},t::Float64)
    intermediate = zeros(ComplexF64,39,39)
    fill!(du,0.0)
    sum_mult!(du,C_array_const,u,C_conj_array_const,33,[39,39])
    mul!(intermediate, Hamiltonian(t), u)
    du .-= 1im .* intermediate
    mul!(intermediate, u, Hamiltonian(t))
    du .+= 1im .* intermediate
    mul!(intermediate, Cprecalc_const, u)
    du .-= 0.5 .* intermediate
    mul!(intermediate, u, Cprecalc_const)
    du .-= 0.5 .* intermediate
end

Lindblad_rhs! (generic function with 1 method)

In [77]:
@benchmark Lindblad_rhs!(du, u,1e-6)

BenchmarkTools.Trial: 
  memory estimate:  1.70 MiB
  allocs estimate:  147
  --------------
  minimum time:     635.001 μs (0.00% GC)
  median time:      1.317 ms (0.00% GC)
  mean time:        1.487 ms (11.68% GC)
  maximum time:     61.371 ms (97.20% GC)
  --------------
  samples:          3351
  evals/sample:     1

Still about a factor 1.5 to 2 slower than the numpy implementation.