In [1]:
using Waveforms
using DifferentialEquations
using StaticArrays
using LinearAlgebra
using BenchmarkTools
using Distributions

In [2]:
BLAS.vendor()

:openblas64

In [3]:
const get_num_threads = function() # anonymous so it will be serialized when called
    blas = LinearAlgebra.BLAS.vendor()
    # Wrap in a try to catch unsupported blas versions
    try
        if blas == :openblas
            return ccall((:openblas_get_num_threads, Base.libblas_name), Cint, ())
        elseif blas == :openblas64
            return ccall((:openblas_get_num_threads64_, Base.libblas_name), Cint, ())
        elseif blas == :mkl
            return ccall((:MKL_Get_Max_Num_Threads, Base.libblas_name), Cint, ())
        end

        # OSX BLAS looks at an environment variable
        if Sys.isapple()
            return tryparse(Cint, get(ENV, "VECLIB_MAXIMUM_THREADS", "1"))
        end
    catch
    end

    return nothing
end

#1 (generic function with 1 method)

In [4]:
get_num_threads()

8

In [5]:
x = rand(5000,5000)
y = rand(5000,5000);

In [6]:
@benchmark z = x*y

BenchmarkTools.Trial: 
  memory estimate:  190.73 MiB
  allocs estimate:  2
  --------------
  minimum time:     855.796 ms (0.00% GC)
  median time:      874.424 ms (0.00% GC)
  mean time:        876.842 ms (0.36% GC)
  maximum time:     903.093 ms (2.01% GC)
  --------------
  samples:          6
  evals/sample:     1

In [7]:
z = zeros(5000,5000);

In [8]:
@benchmark mul!(z,x,y)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     805.031 ms (0.00% GC)
  median time:      820.979 ms (0.00% GC)
  mean time:        822.798 ms (0.00% GC)
  maximum time:     838.818 ms (0.00% GC)
  --------------
  samples:          7
  evals/sample:     1

## Static Arrays are faster but only recommended for use of arrays with less than 100 elements

In [9]:
x = @SMatrix rand(10,10)
y = @SMatrix rand(10,10);

In [10]:
@benchmark z = x*y

BenchmarkTools.Trial: 
  memory estimate:  816 bytes
  allocs estimate:  1
  --------------
  minimum time:     290.217 ns (0.00% GC)
  median time:      639.496 ns (0.00% GC)
  mean time:        658.391 ns (15.43% GC)
  maximum time:     73.588 μs (99.04% GC)
  --------------
  samples:          10000
  evals/sample:     276

In [11]:
x = rand(10,10)
y = rand(10,10);

In [12]:
@benchmark z = x*y

BenchmarkTools.Trial: 
  memory estimate:  896 bytes
  allocs estimate:  1
  --------------
  minimum time:     341.284 ns (0.00% GC)
  median time:      735.321 ns (0.00% GC)
  mean time:        751.507 ns (13.95% GC)
  maximum time:     94.697 μs (99.30% GC)
  --------------
  samples:          10000
  evals/sample:     218

In [13]:
x = Hermitian(rand(39,39) + I)
y = Hermitian(rand(39,39));

In [14]:
@benchmark z = x*y

BenchmarkTools.Trial: 
  memory estimate:  24.13 KiB
  allocs estimate:  2
  --------------
  minimum time:     72.301 μs (0.00% GC)
  median time:      109.800 μs (0.00% GC)
  mean time:        115.255 μs (1.68% GC)
  maximum time:     19.655 ms (98.28% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [15]:
x = Matrix(Hermitian(rand(39,39) + I))
y = Matrix(Hermitian(rand(39,39)));

In [16]:
@benchmark z = x*y

BenchmarkTools.Trial: 
  memory estimate:  12.06 KiB
  allocs estimate:  1
  --------------
  minimum time:     4.171 μs (0.00% GC)
  median time:      7.871 μs (0.00% GC)
  mean time:        10.839 μs (11.52% GC)
  maximum time:     3.797 ms (99.58% GC)
  --------------
  samples:          10000
  evals/sample:     7

In [17]:
x = rand(39,39)
y = rand(39,39);

In [18]:
@benchmark z = x*y

BenchmarkTools.Trial: 
  memory estimate:  12.06 KiB
  allocs estimate:  1
  --------------
  minimum time:     4.243 μs (0.00% GC)
  median time:      5.386 μs (0.00% GC)
  mean time:        7.996 μs (8.34% GC)
  maximum time:     2.852 ms (99.68% GC)
  --------------
  samples:          10000
  evals/sample:     7

### Checking some other aspects of the Lindblad RHS

In [19]:
x = rand(ComplexF64, 39,39)
y = rand(ComplexF64, 39,39);

In [20]:
@benchmark -1im * (x*y-y*x)

BenchmarkTools.Trial: 
  memory estimate:  95.56 KiB
  allocs estimate:  8
  --------------
  minimum time:     142.600 μs (0.00% GC)
  median time:      253.800 μs (0.00% GC)
  mean time:        266.165 μs (4.23% GC)
  maximum time:     20.869 ms (98.65% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [21]:
@benchmark -1im * (x*y-y*x) - 0.5*(y*x + x*y)

BenchmarkTools.Trial: 
  memory estimate:  215.02 KiB
  allocs estimate:  18
  --------------
  minimum time:     293.299 μs (0.00% GC)
  median time:      517.501 μs (0.00% GC)
  mean time:        543.698 μs (4.62% GC)
  maximum time:     25.009 ms (96.84% GC)
  --------------
  samples:          9175
  evals/sample:     1

In [22]:
x = rand(39,39)
y = rand(36,39,39);

In [23]:
function multi_test(x::Array{Complex{Float64},2},y::Array{Complex{Float64},2})::Array{Complex{Float64},2}
    -1im *(y*x-x*y) -0.5 * (y*x+x*y)
end

multi_test (generic function with 1 method)

In [24]:
@time multi_test(x,y);

LoadError: MethodError: no method matching multi_test(::Array{Float64,2}, ::Array{Float64,3})

In [25]:
@benchmark multi_test(x,y)

LoadError: MethodError: no method matching multi_test(::Array{Float64,2}, ::Array{Float64,3})

In [26]:
function sum_matrix(x,y)
    s = zeros(39,39)
    for i = 1:36
        s = s + y[i,:,:]*x*y[i,:,:]
    end
end

sum_matrix (generic function with 1 method)

In [27]:
@benchmark sum_matrix(x,y)

BenchmarkTools.Trial: 
  memory estimate:  2.13 MiB
  allocs estimate:  181
  --------------
  minimum time:     473.101 μs (0.00% GC)
  median time:      549.600 μs (0.00% GC)
  mean time:        796.048 μs (11.86% GC)
  maximum time:     9.726 ms (87.44% GC)
  --------------
  samples:          6266
  evals/sample:     1

In [28]:
x = rand(Float64,39,39)
y = rand(Float64,36,39,39);

In [29]:
function sum_matrix(x,y)
    s = zeros(Float64,39,39)
    for i = 1:36
        s = s + y[i,:,:]*x*y[i,:,:]
    end
end

sum_matrix (generic function with 1 method)

In [30]:
@benchmark sum_matrix(x,y)

BenchmarkTools.Trial: 
  memory estimate:  2.13 MiB
  allocs estimate:  181
  --------------
  minimum time:     476.500 μs (0.00% GC)
  median time:      564.000 μs (0.00% GC)
  mean time:        704.535 μs (10.53% GC)
  maximum time:     24.333 ms (96.88% GC)
  --------------
  samples:          7098
  evals/sample:     1

In [31]:
function sum_matrix(x::Array{Float64,2},y::Array{Float64,3})
    s = zeros(Float64,39,39)
    for i = 1:36
        s = s + y[i,:,:]*x*y[i,:,:]
    end
end

sum_matrix (generic function with 2 methods)

In [32]:
@benchmark sum_matrix(x,y)

BenchmarkTools.Trial: 
  memory estimate:  2.13 MiB
  allocs estimate:  181
  --------------
  minimum time:     477.000 μs (0.00% GC)
  median time:      564.899 μs (0.00% GC)
  mean time:        701.659 μs (10.58% GC)
  maximum time:     22.228 ms (95.99% GC)
  --------------
  samples:          7121
  evals/sample:     1

In [33]:
x = rand(ComplexF64,39,39)
y = rand(ComplexF64,36,39,39);

In [34]:
function sum_matrix(x::Array{Complex{Float64},2},y::Array{Complex{Float64},3})
    s = zeros(ComplexF64,39,39)
    for i = 1:36
        s = s + y[i,:,:]*x*y[i,:,:]
    end
end

sum_matrix (generic function with 3 methods)

In [35]:
@benchmark sum_matrix(x,y)

BenchmarkTools.Trial: 
  memory estimate:  4.22 MiB
  allocs estimate:  362
  --------------
  minimum time:     7.367 ms (0.00% GC)
  median time:      8.763 ms (0.00% GC)
  mean time:        9.360 ms (2.42% GC)
  maximum time:     27.896 ms (62.98% GC)
  --------------
  samples:          535
  evals/sample:     1

In [36]:
function sum_matrix_inbounds(x::Array{Complex{Float64},2},y::Array{Complex{Float64},3})
    s = zeros(ComplexF64,39,39)
    @inbounds for i = 1:36
        s = s + (y[i,:,:]*x)*y[i,:,:]
    end
end

sum_matrix_inbounds (generic function with 1 method)

In [37]:
@benchmark sum_matrix_inbounds(x,y)

BenchmarkTools.Trial: 
  memory estimate:  4.22 MiB
  allocs estimate:  362
  --------------
  minimum time:     7.706 ms (0.00% GC)
  median time:      10.154 ms (0.00% GC)
  mean time:        10.573 ms (4.69% GC)
  maximum time:     42.839 ms (75.70% GC)
  --------------
  samples:          473
  evals/sample:     1

In [38]:
function sum_matrix_inbounds_sim(x::Array{Complex{Float64},2},y::Array{Complex{Float64},3})
    s = zeros(ComplexF64,39,39)
    @inbounds @simd for i = 1:36
        s = s + (y[i,:,:]*x)*y[i,:,:]
    end
end

sum_matrix_inbounds_sim (generic function with 1 method)

In [39]:
@benchmark sum_matrix_inbounds_sim(x,y)

BenchmarkTools.Trial: 
  memory estimate:  4.22 MiB
  allocs estimate:  362
  --------------
  minimum time:     5.484 ms (0.00% GC)
  median time:      9.094 ms (0.00% GC)
  mean time:        9.444 ms (4.75% GC)
  maximum time:     45.671 ms (76.28% GC)
  --------------
  samples:          531
  evals/sample:     1

In [40]:
function sum_matrix_threads(x::Array{Complex{Float64},2},y::Array{Complex{Float64},3})
    s = zeros(ComplexF64,39,39)
    Threads.@threads for i = 1:36
        s = s + (y[i,:,:]*x)*y[i,:,:]
    end
    s
end

sum_matrix_threads (generic function with 1 method)

In [41]:
sum_matrix_threads(x,y);

In [42]:
@benchmark sum_matrix_threads(x,y)

BenchmarkTools.Trial: 
  memory estimate:  4.22 MiB
  allocs estimate:  369
  --------------
  minimum time:     6.239 ms (0.00% GC)
  median time:      9.247 ms (0.00% GC)
  mean time:        9.678 ms (4.47% GC)
  maximum time:     38.476 ms (72.09% GC)
  --------------
  samples:          517
  evals/sample:     1

In [53]:
@benchmark sum_matrix_threads(x,y)

BenchmarkTools.Trial: 
  memory estimate:  4.22 MiB
  allocs estimate:  369
  --------------
  minimum time:     6.952 ms (0.00% GC)
  median time:      9.802 ms (0.00% GC)
  mean time:        10.204 ms (4.53% GC)
  maximum time:     54.762 ms (77.73% GC)
  --------------
  samples:          490
  evals/sample:     1

In [54]:
@time sum_matrix_threads(x,y);

  0.012857 seconds (369 allocations: 4.224 MiB)


In [55]:
@time sum_matrix_threads(x,y);

  0.010220 seconds (371 allocations: 4.224 MiB)


In [56]:
@time sum_matrix_inbounds_sim(x,y)

  0.010319 seconds (362 allocations: 4.223 MiB)


In [62]:
function sum_matrix_inbounds(x::Array{Complex{Float64},2},y::Array{Complex{Float64},3})
    s = zeros(ComplexF64,39,39)
    @inbounds for i = 1:36
        s .= s .+ y[i,:,:].*x.*y[i,:,:]
    end
end

sum_matrix_inbounds (generic function with 1 method)

In [63]:
@benchmark sum_matrix_inbounds(x,y)

BenchmarkTools.Trial: 
  memory estimate:  1.70 MiB
  allocs estimate:  146
  --------------
  minimum time:     336.300 μs (0.00% GC)
  median time:      960.300 μs (0.00% GC)
  mean time:        977.266 μs (18.93% GC)
  maximum time:     37.148 ms (97.36% GC)
  --------------
  samples:          5147
  evals/sample:     1

In [64]:
function sum_matrix_inbounds(x::Array{Complex{Float64},2},y::Array{Complex{Float64},3})
    s = zeros(ComplexF64,39,39)
    @inbounds for i = 1:36
        a = @view y[i,:,:]
        b = @view y[i,:,:]
        s .= s .+ a.*x.*b
    end
end

sum_matrix_inbounds (generic function with 1 method)

In [65]:
@benchmark sum_matrix_inbounds(x,y)

BenchmarkTools.Trial: 
  memory estimate:  23.89 KiB
  allocs estimate:  2
  --------------
  minimum time:     171.200 μs (0.00% GC)
  median time:      189.400 μs (0.00% GC)
  mean time:        197.745 μs (1.88% GC)
  maximum time:     37.411 ms (99.37% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [61]:
function Lindblad_rhs(u::Array{Complex{Float64},2},t::Float64)
    s = zeros(ComplexF64,39,39)
    @inbounds for i = 1:33
        a = @view C_array_const[i,:,:]
        b = @view C_conj_array_const[i,:,:]
        s .= s .+ a.*u.*b
    end
    -1im .* (Hamiltonian(t).*u .- u.*Hamiltonian(t)) .- 0.5 .* (Cprecalc_const.*u .+ u.*Cprecalc_const) .+ s
end

Lindblad_rhs (generic function with 2 methods)

In [73]:
const Cprecalc_const = rand(ComplexF64,39,39)
const C_array_const = rand(ComplexF64,36,39,39)
const C_conj_array_const = rand(ComplexF64,36,39,39);



In [74]:
function Hamiltonian(t::Float64)::Array{Complex{Float64},2}
    rand(ComplexF64,39,39).*t
end

Hamiltonian (generic function with 1 method)

In [106]:
u = rand(ComplexF64,39,39);

In [76]:
@benchmark Lindblad_rhs(u,1e-6)

BenchmarkTools.Trial: 
  memory estimate:  143.34 KiB
  allocs estimate:  12
  --------------
  minimum time:     311.801 μs (0.00% GC)
  median time:      385.200 μs (0.00% GC)
  mean time:        402.042 μs (5.37% GC)
  maximum time:     53.020 ms (99.28% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [89]:
function sum_mult!(result::Array{Complex{Float64},2}, 
                  A::Array{Complex{Float64},3},
                  B::Array{Complex{Float64},2}, 
                  C::Array{Complex{Float64},3})
    @inbounds for i = 1:33
        a = @view A[i,:,:]
        c = @view C[i,:,:]
        result .= result .+ a.*B.*c
    end
end

sum_mult! (generic function with 1 method)

In [95]:
s = zeros(ComplexF64,39,39);

In [97]:
@benchmark sum_mult!(s,C_array_const,u,C_conj_array_const)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     233.900 μs (0.00% GC)
  median time:      243.800 μs (0.00% GC)
  mean time:        249.805 μs (0.00% GC)
  maximum time:     742.301 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [99]:
function Lindblad_rhs(u::Array{Complex{Float64},2},t::Float64)
    s = zeros(ComplexF64,39,39)
    sum_mult!(s,C_array_const,u,C_conj_array_const)
    s .= s .- 1im .* Hamiltonian(t).*u
    s .= s .+ 1im .* Hamiltonian(t).*u
    s .= s .- 0.5 .* Cprecalc_const.*u
    s .= s .- 0.5 .* u.*Cprecalc_const
    return s
end

Lindblad_rhs (generic function with 3 methods)

In [100]:
@benchmark Lindblad_rhs(u,1e-6)

BenchmarkTools.Trial: 
  memory estimate:  119.45 KiB
  allocs estimate:  10
  --------------
  minimum time:     305.900 μs (0.00% GC)
  median time:      371.800 μs (0.00% GC)
  mean time:        386.310 μs (4.38% GC)
  maximum time:     52.048 ms (99.26% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [113]:
function Lindblad_rhs!(du::Array{Complex{Float64}}, u::Array{Complex{Float64},2},t::Float64)
    fill!(du,0.0)
    sum_mult!(du,C_array_const,u,C_conj_array_const)
    du .= du .- 1im .* Hamiltonian(t).*u
    du .= du .+ 1im .* Hamiltonian(t).*u
    du .= du .- 0.5 .* Cprecalc_const.*u
    du .= du .- 0.5 .* u.*Cprecalc_const
end

Lindblad_rhs! (generic function with 1 method)

In [114]:
du = rand(ComplexF64,39,39);

In [115]:
@benchmark Lindblad_rhs!(du, u,1e-6)

BenchmarkTools.Trial: 
  memory estimate:  95.56 KiB
  allocs estimate:  8
  --------------
  minimum time:     303.200 μs (0.00% GC)
  median time:      360.501 μs (0.00% GC)
  mean time:        369.364 μs (2.82% GC)
  maximum time:     47.758 ms (98.98% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [119]:
function square_wave(t::Float64, frequency::Float64, phase::Float64)
    0.5.*(1+squarewave(2*pi.*frequency.*t .+ phase))
end

square_wave (generic function with 1 method)

In [121]:
@benchmark square_wave(1e-6, 1/(2*2e-6), 0.0)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     3.299 ns (0.00% GC)
  median time:      3.400 ns (0.00% GC)
  mean time:        3.483 ns (0.00% GC)
  maximum time:     19.399 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1000

In [128]:
function Hamiltonian(t::Float64)::Array{Complex{Float64},2}
    rand(ComplexF64,39,39).*square_wave(t, 1/(2*2e-6), 0.0)
end

Hamiltonian (generic function with 1 method)

In [129]:
function Lindblad_rhs!(du::Array{Complex{Float64}}, u::Array{Complex{Float64},2},t::Float64)
    fill!(du,0.0)
    sum_mult!(du,C_array_const,u,C_conj_array_const)
    du .= du .- 1im .* Hamiltonian(t).*u
    du .= du .+ 1im .* Hamiltonian(t).*u
    du .= du .- 0.5 .* Cprecalc_const.*u
    du .= du .- 0.5 .* u.*Cprecalc_const
end

Lindblad_rhs! (generic function with 1 method)

In [130]:
@benchmark Lindblad_rhs!(du, u,1e-6)

BenchmarkTools.Trial: 
  memory estimate:  95.56 KiB
  allocs estimate:  8
  --------------
  minimum time:     308.100 μs (0.00% GC)
  median time:      364.200 μs (0.00% GC)
  mean time:        373.702 μs (2.84% GC)
  maximum time:     43.560 ms (99.15% GC)
  --------------
  samples:          10000
  evals/sample:     1