In [1]:
using Waveforms
using DifferentialEquations
using StaticArrays
using LinearAlgebra
using BenchmarkTools
using Distributions

In [2]:
BLAS.vendor()

:openblas64

In [3]:
const get_num_threads = function() # anonymous so it will be serialized when called
    blas = LinearAlgebra.BLAS.vendor()
    # Wrap in a try to catch unsupported blas versions
    try
        if blas == :openblas
            return ccall((:openblas_get_num_threads, Base.libblas_name), Cint, ())
        elseif blas == :openblas64
            return ccall((:openblas_get_num_threads64_, Base.libblas_name), Cint, ())
        elseif blas == :mkl
            return ccall((:MKL_Get_Max_Num_Threads, Base.libblas_name), Cint, ())
        end

        # OSX BLAS looks at an environment variable
        if Sys.isapple()
            return tryparse(Cint, get(ENV, "VECLIB_MAXIMUM_THREADS", "1"))
        end
    catch
    end

    return nothing
end

#3 (generic function with 1 method)

In [4]:
get_num_threads()

8

In [5]:
x = rand(5000,5000)
y = rand(5000,5000);

In [6]:
@benchmark z = x*y

BenchmarkTools.Trial: 
  memory estimate:  190.73 MiB
  allocs estimate:  2
  --------------
  minimum time:     1.080 s (0.00% GC)
  median time:      1.095 s (0.00% GC)
  mean time:        1.097 s (0.45% GC)
  maximum time:     1.118 s (2.17% GC)
  --------------
  samples:          5
  evals/sample:     1

In [7]:
z = zeros(5000,5000);

In [8]:
@benchmark mul!(z,x,y)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.050 s (0.00% GC)
  median time:      1.070 s (0.00% GC)
  mean time:        1.070 s (0.00% GC)
  maximum time:     1.093 s (0.00% GC)
  --------------
  samples:          5
  evals/sample:     1

## Static Arrays are faster but only recommended for use of arrays with less than 100 elements

In [9]:
x = @SMatrix rand(10,10)
y = @SMatrix rand(10,10);

In [10]:
@benchmark z = x*y

BenchmarkTools.Trial: 
  memory estimate:  816 bytes
  allocs estimate:  1
  --------------
  minimum time:     262.983 ns (0.00% GC)
  median time:      529.558 ns (0.00% GC)
  mean time:        572.633 ns (14.50% GC)
  maximum time:     52.927 μs (99.07% GC)
  --------------
  samples:          10000
  evals/sample:     362

In [11]:
x = rand(10,10)
y = rand(10,10);

In [12]:
@benchmark z = x*y

BenchmarkTools.Trial: 
  memory estimate:  896 bytes
  allocs estimate:  1
  --------------
  minimum time:     337.904 ns (0.00% GC)
  median time:      642.466 ns (0.00% GC)
  mean time:        677.353 ns (13.40% GC)
  maximum time:     89.700 μs (99.17% GC)
  --------------
  samples:          10000
  evals/sample:     219

In [13]:
x = Hermitian(rand(39,39) + I)
y = Hermitian(rand(39,39));

In [14]:
@benchmark z = x*y

BenchmarkTools.Trial: 
  memory estimate:  24.13 KiB
  allocs estimate:  2
  --------------
  minimum time:     17.100 μs (0.00% GC)
  median time:      25.401 μs (0.00% GC)
  mean time:        30.110 μs (5.69% GC)
  maximum time:     17.172 ms (99.69% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [15]:
x = Matrix(Hermitian(rand(39,39) + I))
y = Matrix(Hermitian(rand(39,39)));

In [16]:
@benchmark z = x*y

BenchmarkTools.Trial: 
  memory estimate:  12.06 KiB
  allocs estimate:  1
  --------------
  minimum time:     5.050 μs (0.00% GC)
  median time:      6.183 μs (0.00% GC)
  mean time:        7.396 μs (6.92% GC)
  maximum time:     2.910 ms (99.62% GC)
  --------------
  samples:          10000
  evals/sample:     6

In [17]:
x = rand(39,39)
y = rand(39,39);

In [18]:
@benchmark z = x*y

BenchmarkTools.Trial: 
  memory estimate:  12.06 KiB
  allocs estimate:  1
  --------------
  minimum time:     5.000 μs (0.00% GC)
  median time:      5.217 μs (0.00% GC)
  mean time:        5.960 μs (1.34% GC)
  maximum time:     806.333 μs (98.77% GC)
  --------------
  samples:          10000
  evals/sample:     6

### Checking some other aspects of the Lindblad RHS

In [19]:
x = rand(ComplexF64, 39,39)
y = rand(ComplexF64, 39,39);

In [20]:
@benchmark -1im * (x*y-y*x)

BenchmarkTools.Trial: 
  memory estimate:  95.56 KiB
  allocs estimate:  8
  --------------
  minimum time:     40.699 μs (0.00% GC)
  median time:      44.200 μs (0.00% GC)
  mean time:        53.716 μs (6.57% GC)
  maximum time:     14.998 ms (99.46% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [21]:
@benchmark -1im * (x*y-y*x) - 0.5*(y*x + x*y)

BenchmarkTools.Trial: 
  memory estimate:  215.02 KiB
  allocs estimate:  18
  --------------
  minimum time:     81.100 μs (0.00% GC)
  median time:      145.000 μs (0.00% GC)
  mean time:        154.582 μs (13.97% GC)
  maximum time:     29.071 ms (99.43% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [69]:
x = rand(ComplexF64,39,39)
y = rand(ComplexF64,36,39,39);

In [70]:
function multi_test(x::Array{Complex{Float64},2},y::Array{Complex{Float64},2})}
    -1im *(y*x-x*y) -0.5 * (y*x+x*y)
end

multi_test (generic function with 1 method)

In [71]:
@time multi_test(x,y);

MethodError: MethodError: no method matching multi_test(::Array{Complex{Float64},2}, ::Array{Complex{Float64},3})
Closest candidates are:
  multi_test(::Array{Complex{Float64},2}, !Matched::Array{Complex{Float64},2}) at In[70]:2

In [72]:
@benchmark multi_test(x,y)

MethodError: MethodError: no method matching multi_test(::Array{Complex{Float64},2}, ::Array{Complex{Float64},3})
Closest candidates are:
  multi_test(::Array{Complex{Float64},2}, !Matched::Array{Complex{Float64},2}) at In[70]:2

In [73]:
function sum_matrix(x,y)
    s = zeros(39,39)
    for i = 1:36
        s = s + y[i,:,:]*x*y[i,:,:]
    end
end

sum_matrix (generic function with 3 methods)

In [74]:
@benchmark sum_matrix(x,y)

BenchmarkTools.Trial: 
  memory estimate:  4.22 MiB
  allocs estimate:  362
  --------------
  minimum time:     1.614 ms (0.00% GC)
  median time:      2.918 ms (0.00% GC)
  mean time:        3.206 ms (17.09% GC)
  maximum time:     68.120 ms (95.28% GC)
  --------------
  samples:          1556
  evals/sample:     1

In [75]:
x = rand(Float64,39,39)
y = rand(Float64,36,39,39);

In [76]:
function sum_matrix(x,y)
    s = zeros(Float64,39,39)
    for i = 1:36
        s = s + y[i,:,:]*x*y[i,:,:]
    end
end

sum_matrix (generic function with 3 methods)

In [77]:
@benchmark sum_matrix(x,y)

BenchmarkTools.Trial: 
  memory estimate:  2.13 MiB
  allocs estimate:  181
  --------------
  minimum time:     555.699 μs (0.00% GC)
  median time:      714.149 μs (0.00% GC)
  mean time:        992.701 μs (13.48% GC)
  maximum time:     38.668 ms (96.69% GC)
  --------------
  samples:          5026
  evals/sample:     1

In [78]:
function sum_matrix(x::Array{Float64,2},y::Array{Float64,3})
    s = zeros(Float64,39,39)
    for i = 1:36
        s = s + y[i,:,:]*x*y[i,:,:]
    end
end

sum_matrix (generic function with 3 methods)

In [79]:
@benchmark sum_matrix(x,y)

BenchmarkTools.Trial: 
  memory estimate:  2.13 MiB
  allocs estimate:  181
  --------------
  minimum time:     554.301 μs (0.00% GC)
  median time:      708.100 μs (0.00% GC)
  mean time:        893.571 μs (11.98% GC)
  maximum time:     16.111 ms (91.48% GC)
  --------------
  samples:          5584
  evals/sample:     1

In [80]:
x = rand(ComplexF64,39,39)
y = rand(ComplexF64,36,39,39);

In [81]:
function sum_matrix(x::Array{Complex{Float64},2},y::Array{Complex{Float64},3})
    s = zeros(ComplexF64,39,39)
    for i = 1:36
        s = s + y[i,:,:]*x*y[i,:,:]
    end
end

sum_matrix (generic function with 3 methods)

In [82]:
@benchmark sum_matrix(x,y)

BenchmarkTools.Trial: 
  memory estimate:  4.22 MiB
  allocs estimate:  362
  --------------
  minimum time:     1.673 ms (0.00% GC)
  median time:      2.008 ms (0.00% GC)
  mean time:        2.663 ms (10.41% GC)
  maximum time:     56.540 ms (94.97% GC)
  --------------
  samples:          1873
  evals/sample:     1

In [83]:
function sum_matrix_inbounds(x::Array{Complex{Float64},2},y::Array{Complex{Float64},3})
    s = zeros(ComplexF64,39,39)
    @inbounds for i = 1:36
        s = s + (y[i,:,:]*x)*y[i,:,:]
    end
end

sum_matrix_inbounds (generic function with 1 method)

In [84]:
@benchmark sum_matrix_inbounds(x,y)

BenchmarkTools.Trial: 
  memory estimate:  4.22 MiB
  allocs estimate:  362
  --------------
  minimum time:     1.661 ms (0.00% GC)
  median time:      3.010 ms (0.00% GC)
  mean time:        3.334 ms (16.02% GC)
  maximum time:     63.437 ms (94.66% GC)
  --------------
  samples:          1496
  evals/sample:     1

In [85]:
function sum_matrix_inbounds_sim(x::Array{Complex{Float64},2},y::Array{Complex{Float64},3})
    s = zeros(ComplexF64,39,39)
    @inbounds @simd for i = 1:36
        s = s + (y[i,:,:]*x)*y[i,:,:]
    end
end

sum_matrix_inbounds_sim (generic function with 1 method)

In [86]:
@benchmark sum_matrix_inbounds_sim(x,y)

BenchmarkTools.Trial: 
  memory estimate:  4.22 MiB
  allocs estimate:  362
  --------------
  minimum time:     1.673 ms (0.00% GC)
  median time:      2.986 ms (0.00% GC)
  mean time:        3.277 ms (15.95% GC)
  maximum time:     62.220 ms (95.09% GC)
  --------------
  samples:          1523
  evals/sample:     1

In [87]:
function sum_matrix_threads(x::Array{Complex{Float64},2},y::Array{Complex{Float64},3})
    s = zeros(ComplexF64,39,39)
    Threads.@threads for i = 1:36
        s = s + (y[i,:,:]*x)*y[i,:,:]
    end
    s
end

sum_matrix_threads (generic function with 1 method)

In [88]:
sum_matrix_threads(x,y);

In [89]:
@benchmark sum_matrix_threads(x,y)

BenchmarkTools.Trial: 
  memory estimate:  4.22 MiB
  allocs estimate:  371
  --------------
  minimum time:     1.731 ms (0.00% GC)
  median time:      2.979 ms (0.00% GC)
  mean time:        3.252 ms (15.07% GC)
  maximum time:     63.610 ms (96.90% GC)
  --------------
  samples:          1535
  evals/sample:     1

In [90]:
@benchmark sum_matrix_threads(x,y)

BenchmarkTools.Trial: 
  memory estimate:  4.22 MiB
  allocs estimate:  371
  --------------
  minimum time:     1.691 ms (0.00% GC)
  median time:      2.998 ms (0.00% GC)
  mean time:        3.368 ms (17.14% GC)
  maximum time:     78.245 ms (96.24% GC)
  --------------
  samples:          1481
  evals/sample:     1

In [91]:
@time sum_matrix_threads(x,y);

  0.003549 seconds (371 allocations: 4.224 MiB)


In [92]:
@time sum_matrix_threads(x,y);

  0.003058 seconds (371 allocations: 4.224 MiB)


In [93]:
@time sum_matrix_inbounds_sim(x,y)

  0.003428 seconds (362 allocations: 4.223 MiB)


In [150]:
function sum_matrix_inbounds(x::Array{Complex{Float64},2},y::Array{Complex{Float64},3})
    s = zeros(ComplexF64,39,39)
    @inbounds for i = 1:36
        s .= s .+ y[i,:,:]*x*y[i,:,:]
    end
end

sum_matrix_inbounds (generic function with 1 method)

In [151]:
@benchmark sum_matrix_inbounds(x,y)

BenchmarkTools.Trial: 
  memory estimate:  3.38 MiB
  allocs estimate:  290
  --------------
  minimum time:     1.598 ms (0.00% GC)
  median time:      2.650 ms (0.00% GC)
  mean time:        2.986 ms (15.48% GC)
  maximum time:     87.838 ms (97.05% GC)
  --------------
  samples:          1693
  evals/sample:     1

In [152]:
function sum_matrix_inbounds(x::Array{Complex{Float64},2},y::Array{Complex{Float64},3})
    s = zeros(ComplexF64,39,39)
    @inbounds for i = 1:36
        a = @view y[i,:,:]
        b = @view y[i,:,:]
        s .= s .+ a*x*b
    end
end

sum_matrix_inbounds (generic function with 1 method)

In [153]:
@benchmark sum_matrix_inbounds(x,y)

BenchmarkTools.Trial: 
  memory estimate:  1.73 MiB
  allocs estimate:  650
  --------------
  minimum time:     4.319 ms (0.00% GC)
  median time:      4.868 ms (0.00% GC)
  mean time:        5.058 ms (4.87% GC)
  maximum time:     75.790 ms (93.88% GC)
  --------------
  samples:          988
  evals/sample:     1

In [154]:
function Lindblad_rhs(u::Array{Complex{Float64},2},t::Float64)
    s = zeros(ComplexF64,39,39)
    @inbounds for i = 1:33
        a = @view C_array_const[i,:,:]
        b = @view C_conj_array_const[i,:,:]
        s .= s .+ a*u*b
    end
    -1im .* (Hamiltonian(t).*u .- u*Hamiltonian(t)) .- 0.5 * (Cprecalc_const*u .+ u*Cprecalc_const) .+ s
end

Lindblad_rhs (generic function with 1 method)

In [155]:
const Cprecalc_const = rand(ComplexF64,39,39)
const C_array_const = rand(ComplexF64,36,39,39)
const C_conj_array_const = rand(ComplexF64,36,39,39);



In [156]:
function Hamiltonian(t::Float64)::Array{Complex{Float64},2}
    rand(ComplexF64,39,39)*t
end

Hamiltonian (generic function with 1 method)

In [157]:
u = rand(ComplexF64,39,39);

In [158]:
@benchmark Lindblad_rhs(u,1e-6)

BenchmarkTools.Trial: 
  memory estimate:  1.82 MiB
  allocs estimate:  616
  --------------
  minimum time:     4.138 ms (0.00% GC)
  median time:      4.698 ms (0.00% GC)
  mean time:        4.999 ms (5.99% GC)
  maximum time:     107.664 ms (95.81% GC)
  --------------
  samples:          1000
  evals/sample:     1

In [159]:
function sum_mult!(result::Array{Complex{Float64},2}, 
                  A::Array{Complex{Float64},3},
                  B::Array{Complex{Float64},2}, 
                  C::Array{Complex{Float64},3})
    @inbounds for i = 1:33
        a = @view A[i,:,:]
        c = @view C[i,:,:]
        result .= result .+ a*B*c
    end
end

sum_mult! (generic function with 1 method)

In [160]:
s = zeros(ComplexF64,39,39);

In [161]:
@benchmark sum_mult!(s,C_array_const,u,C_conj_array_const)

BenchmarkTools.Trial: 
  memory estimate:  1.57 MiB
  allocs estimate:  594
  --------------
  minimum time:     3.993 ms (0.00% GC)
  median time:      4.491 ms (0.00% GC)
  mean time:        4.656 ms (4.10% GC)
  maximum time:     90.976 ms (94.19% GC)
  --------------
  samples:          1073
  evals/sample:     1

In [162]:
function sum_mult!(result::Array{Complex{Float64},2}, 
                  A::Array{Complex{Float64},3},
                  B::Array{Complex{Float64},2}, 
                  C::Array{Complex{Float64},3})
    @inbounds for i = 1:33
        result .= result .+ A[i,:,:]*B*C[i,:,:]
    end
end

sum_mult! (generic function with 1 method)

In [163]:
s = zeros(ComplexF64,39,39);

In [164]:
@benchmark sum_mult!(s,C_array_const,u,C_conj_array_const)

BenchmarkTools.Trial: 
  memory estimate:  3.08 MiB
  allocs estimate:  264
  --------------
  minimum time:     1.476 ms (0.00% GC)
  median time:      2.487 ms (0.00% GC)
  mean time:        2.973 ms (19.98% GC)
  maximum time:     125.652 ms (98.02% GC)
  --------------
  samples:          1678
  evals/sample:     1

In [216]:
size(C_array_const)[1]

36

In [232]:
function sum_mult!(result::Array{Complex{Float64},2}, 
                  A::Array{Complex{Float64},3},
                  B::Array{Complex{Float64},2}, 
                  C::Array{Complex{Float64},3},
                  idx_max::Int64,
                  shape::Array{Int64,1}
    )
    intermediate1 = zeros(ComplexF64, shape[1], shape[2])
    intermediate2 = zeros(ComplexF64, shape[1], shape[2])
    @inbounds for i = 1:idx_max
        mul!(intermediate1,B,C[i,:,:])
        mul!(intermediate2, A[i,:,:], intermediate1)
        result .+= intermediate2
    end
end

sum_mult! (generic function with 4 methods)

In [233]:
s = zeros(ComplexF64,39,39);

In [234]:
@benchmark sum_mult!(s,C_array_const,u,C_conj_array_const, 33,[39,39])

BenchmarkTools.Trial: 
  memory estimate:  1.59 MiB
  allocs estimate:  137
  --------------
  minimum time:     1.446 ms (0.00% GC)
  median time:      1.954 ms (0.00% GC)
  mean time:        2.299 ms (16.07% GC)
  maximum time:     201.854 ms (99.06% GC)
  --------------
  samples:          2171
  evals/sample:     1

In [235]:
function Lindblad_rhs(u::Array{Complex{Float64},2},t::Float64)
    s = zeros(ComplexF64,39,39)
    sum_mult!(s,C_array_const,u,C_conj_array_const,33,[39,39])
    s .= s .- 1im .* Hamiltonian(t)*u
    s .= s .+ 1im .* Hamiltonian(t)*u
    s .= s .- 0.5 .* Cprecalc_const*u
    s .= s .- 0.5 .* u*Cprecalc_const
    return s
end

Lindblad_rhs (generic function with 1 method)

In [236]:
@benchmark Lindblad_rhs(u,1e-6)

BenchmarkTools.Trial: 
  memory estimate:  1.89 MiB
  allocs estimate:  163
  --------------
  minimum time:     1.565 ms (0.00% GC)
  median time:      2.191 ms (0.00% GC)
  mean time:        2.565 ms (14.80% GC)
  maximum time:     150.214 ms (98.53% GC)
  --------------
  samples:          1945
  evals/sample:     1

In [244]:
function Lindblad_rhs!(du::Array{Complex{Float64}}, u::Array{Complex{Float64},2},t::Float64)
    intermediate = zeros(ComplexF64,39,39)
    fill!(du,0.0)
    sum_mult!(du,C_array_const,u,C_conj_array_const,33,[39,39])
    mul!(intermediate, Hamiltonian(t), u)
    du .-= 1im .* intermediate
    mul!(intermediate, u, Hamiltonian(t))
    du .+= 1im .* intermediate
    mul!(intermediate, Cprecalc_const, u)
    du .-= 0.5 .* intermediate
    mul!(intermediate, u, Cprecalc_const)
    du .-= 0.5 .* intermediate
end

Lindblad_rhs! (generic function with 1 method)

In [245]:
du = rand(ComplexF64,39,39);

In [246]:
@benchmark Lindblad_rhs!(du, u,1e-6)

BenchmarkTools.Trial: 
  memory estimate:  1.70 MiB
  allocs estimate:  147
  --------------
  minimum time:     1.559 ms (0.00% GC)
  median time:      2.101 ms (0.00% GC)
  mean time:        2.656 ms (21.85% GC)
  maximum time:     368.673 ms (99.44% GC)
  --------------
  samples:          1879
  evals/sample:     1

In [242]:
@benchmark Lindblad_rhs!(du, u,1e-6)

BenchmarkTools.Trial: 
  memory estimate:  1.87 MiB
  allocs estimate:  161
  --------------
  minimum time:     1.571 ms (0.00% GC)
  median time:      2.174 ms (0.00% GC)
  mean time:        2.549 ms (15.89% GC)
  maximum time:     238.618 ms (99.08% GC)
  --------------
  samples:          1958
  evals/sample:     1

In [111]:
function square_wave(t::Float64, frequency::Float64, phase::Float64)
    0.5.*(1+squarewave(2*pi.*frequency.*t .+ phase))
end

square_wave (generic function with 1 method)

In [112]:
@benchmark square_wave(1e-6, 1/(2*2e-6), 0.0)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     2.499 ns (0.00% GC)
  median time:      2.600 ns (0.00% GC)
  mean time:        2.625 ns (0.00% GC)
  maximum time:     10.100 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1000

In [249]:
function Hamiltonian(t::Float64)::Array{Complex{Float64},2}
    rand(ComplexF64,39,39).*square_wave(t, 1/(2*2e-6), 0.0)
end

Hamiltonian (generic function with 1 method)

In [250]:
function Lindblad_rhs!(du::Array{Complex{Float64}}, u::Array{Complex{Float64},2},t::Float64)
    intermediate = zeros(ComplexF64,39,39)
    fill!(du,0.0)
    sum_mult!(du,C_array_const,u,C_conj_array_const,33,[39,39])
    mul!(intermediate, Hamiltonian(t), u)
    du .-= 1im .* intermediate
    mul!(intermediate, u, Hamiltonian(t))
    du .+= 1im .* intermediate
    mul!(intermediate, Cprecalc_const, u)
    du .-= 0.5 .* intermediate
    mul!(intermediate, u, Cprecalc_const)
    du .-= 0.5 .* intermediate
end

Lindblad_rhs! (generic function with 1 method)

In [251]:
@benchmark Lindblad_rhs!(du, u,1e-6)

BenchmarkTools.Trial: 
  memory estimate:  1.70 MiB
  allocs estimate:  147
  --------------
  minimum time:     1.570 ms (0.00% GC)
  median time:      2.110 ms (0.00% GC)
  mean time:        2.481 ms (15.30% GC)
  maximum time:     197.407 ms (98.80% GC)
  --------------
  samples:          2011
  evals/sample:     1

Still about a factor 1.5 to 2 slower than the numpy implementation.