In [1]:
using Waveforms
using DifferentialEquations
using StaticArrays
using LinearAlgebra
using BenchmarkTools
using Distributions

In [2]:
BLAS.vendor()

:openblas64

In [3]:
const get_num_threads = function() # anonymous so it will be serialized when called
    blas = LinearAlgebra.BLAS.vendor()
    # Wrap in a try to catch unsupported blas versions
    try
        if blas == :openblas
            return ccall((:openblas_get_num_threads, Base.libblas_name), Cint, ())
        elseif blas == :openblas64
            return ccall((:openblas_get_num_threads64_, Base.libblas_name), Cint, ())
        elseif blas == :mkl
            return ccall((:MKL_Get_Max_Num_Threads, Base.libblas_name), Cint, ())
        end

        # OSX BLAS looks at an environment variable
        if Sys.isapple()
            return tryparse(Cint, get(ENV, "VECLIB_MAXIMUM_THREADS", "1"))
        end
    catch
    end

    return nothing
end

#3 (generic function with 1 method)

In [4]:
get_num_threads()

8

In [5]:
x = rand(5000,5000)
y = rand(5000,5000);

In [6]:
@benchmark z = x*y

BenchmarkTools.Trial: 
  memory estimate:  190.73 MiB
  allocs estimate:  2
  --------------
  minimum time:     1.079 s (0.00% GC)
  median time:      1.084 s (0.00% GC)
  mean time:        1.090 s (0.44% GC)
  maximum time:     1.109 s (2.14% GC)
  --------------
  samples:          5
  evals/sample:     1

In [7]:
z = zeros(5000,5000);

In [8]:
@benchmark mul!(z,x,y)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.070 s (0.00% GC)
  median time:      1.071 s (0.00% GC)
  mean time:        1.074 s (0.00% GC)
  maximum time:     1.083 s (0.00% GC)
  --------------
  samples:          5
  evals/sample:     1

## Static Arrays are faster but only recommended for use of arrays with less than 100 elements

In [9]:
x = @SMatrix rand(10,10)
y = @SMatrix rand(10,10);

In [10]:
@benchmark z = x*y

BenchmarkTools.Trial: 
  memory estimate:  816 bytes
  allocs estimate:  1
  --------------
  minimum time:     260.043 ns (0.00% GC)
  median time:      522.833 ns (0.00% GC)
  mean time:        539.211 ns (14.35% GC)
  maximum time:     39.215 μs (98.54% GC)
  --------------
  samples:          10000
  evals/sample:     438

In [11]:
x = rand(10,10)
y = rand(10,10);

In [12]:
@benchmark z = x*y

BenchmarkTools.Trial: 
  memory estimate:  896 bytes
  allocs estimate:  1
  --------------
  minimum time:     342.134 ns (0.00% GC)
  median time:      632.403 ns (0.00% GC)
  mean time:        652.256 ns (13.19% GC)
  maximum time:     76.574 μs (98.50% GC)
  --------------
  samples:          10000
  evals/sample:     216

In [13]:
x = Hermitian(rand(39,39) + I)
y = Hermitian(rand(39,39));

In [14]:
@benchmark z = x*y

BenchmarkTools.Trial: 
  memory estimate:  24.13 KiB
  allocs estimate:  2
  --------------
  minimum time:     15.699 μs (0.00% GC)
  median time:      25.200 μs (0.00% GC)
  mean time:        29.749 μs (5.62% GC)
  maximum time:     16.793 ms (99.60% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [15]:
x = Matrix(Hermitian(rand(39,39) + I))
y = Matrix(Hermitian(rand(39,39)));

In [16]:
@benchmark z = x*y

BenchmarkTools.Trial: 
  memory estimate:  12.06 KiB
  allocs estimate:  1
  --------------
  minimum time:     5.050 μs (0.00% GC)
  median time:      5.300 μs (0.00% GC)
  mean time:        6.619 μs (4.64% GC)
  maximum time:     1.059 ms (98.36% GC)
  --------------
  samples:          10000
  evals/sample:     6

In [17]:
x = rand(39,39)
y = rand(39,39);

In [18]:
@benchmark z = x*y

BenchmarkTools.Trial: 
  memory estimate:  12.06 KiB
  allocs estimate:  1
  --------------
  minimum time:     5.117 μs (0.00% GC)
  median time:      5.317 μs (0.00% GC)
  mean time:        6.424 μs (3.84% GC)
  maximum time:     662.516 μs (98.32% GC)
  --------------
  samples:          10000
  evals/sample:     6

### Checking some other aspects of the Lindblad RHS

In [95]:
x = rand(ComplexF64, 39,39)
y = rand(ComplexF64, 39,39);

In [96]:
@benchmark -1im * (x*y-y*x)

BenchmarkTools.Trial: 
  memory estimate:  95.56 KiB
  allocs estimate:  8
  --------------
  minimum time:     39.000 μs (0.00% GC)
  median time:      42.801 μs (0.00% GC)
  mean time:        58.039 μs (9.98% GC)
  maximum time:     40.315 ms (99.79% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [97]:
@benchmark -1im * (x*y-y*x) - 0.5*(y*x + x*y)

BenchmarkTools.Trial: 
  memory estimate:  215.02 KiB
  allocs estimate:  18
  --------------
  minimum time:     79.701 μs (0.00% GC)
  median time:      141.501 μs (0.00% GC)
  mean time:        146.891 μs (14.95% GC)
  maximum time:     59.140 ms (99.71% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [54]:
x = rand(39,39)
y = rand(36,39,39);

In [100]:
function multi_test(x::Array{Complex{Float64},2},y::Array{Complex{Float64},2})::Array{Complex{Float64},2}
    -1im *(y*x-x*y) -0.5 * (y*x+x*y)
end

multi_test (generic function with 1 method)

In [103]:
@time multi_test(x,y);

  0.000146 seconds (18 allocations: 215.016 KiB)


In [104]:
@benchmark multi_test(x,y)

BenchmarkTools.Trial: 
  memory estimate:  215.02 KiB
  allocs estimate:  18
  --------------
  minimum time:     79.900 μs (0.00% GC)
  median time:      142.500 μs (0.00% GC)
  mean time:        151.308 μs (14.83% GC)
  maximum time:     45.376 ms (99.64% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [55]:
function sum_matrix(x,y)
    s = zeros(39,39)
    for i = 1:36
        s = s + y[i,:,:]*x*y[i,:,:]
    end
end

sum_matrix (generic function with 3 methods)

In [56]:
@benchmark sum_matrix(x,y)

BenchmarkTools.Trial: 
  memory estimate:  2.13 MiB
  allocs estimate:  181
  --------------
  minimum time:     556.101 μs (0.00% GC)
  median time:      660.800 μs (0.00% GC)
  mean time:        933.203 μs (12.47% GC)
  maximum time:     35.667 ms (97.69% GC)
  --------------
  samples:          5333
  evals/sample:     1

In [57]:
x = rand(Float64,39,39)
y = rand(Float64,36,39,39);

In [58]:
function sum_matrix(x,y)
    s = zeros(Float64,39,39)
    for i = 1:36
        s = s + y[i,:,:]*x*y[i,:,:]
    end
end

sum_matrix (generic function with 3 methods)

In [59]:
@benchmark sum_matrix(x,y)

BenchmarkTools.Trial: 
  memory estimate:  2.13 MiB
  allocs estimate:  181
  --------------
  minimum time:     566.400 μs (0.00% GC)
  median time:      670.100 μs (0.00% GC)
  mean time:        907.798 μs (11.64% GC)
  maximum time:     15.400 ms (94.74% GC)
  --------------
  samples:          5458
  evals/sample:     1

In [60]:
function sum_matrix(x::Array{Float64,2},y::Array{Float64,3})
    s = zeros(Float64,39,39)
    for i = 1:36
        s = s + y[i,:,:]*x*y[i,:,:]
    end
end

sum_matrix (generic function with 4 methods)

In [61]:
@benchmark sum_matrix(x,y)

BenchmarkTools.Trial: 
  memory estimate:  2.13 MiB
  allocs estimate:  181
  --------------
  minimum time:     563.801 μs (0.00% GC)
  median time:      672.399 μs (0.00% GC)
  mean time:        914.078 μs (11.84% GC)
  maximum time:     33.181 ms (96.56% GC)
  --------------
  samples:          5446
  evals/sample:     1

In [62]:
x = rand(ComplexF64,39,39)
y = rand(ComplexF64,36,39,39);

In [63]:
function sum_matrix(x::Array{Complex{Float64},2},y::Array{Complex{Float64},3})
    s = zeros(ComplexF64,39,39)
    for i = 1:36
        s = s + y[i,:,:]*x*y[i,:,:]
    end
end

sum_matrix (generic function with 5 methods)

In [64]:
@benchmark sum_matrix(x,y)

BenchmarkTools.Trial: 
  memory estimate:  4.22 MiB
  allocs estimate:  362
  --------------
  minimum time:     1.580 ms (0.00% GC)
  median time:      2.882 ms (0.00% GC)
  mean time:        2.873 ms (13.18% GC)
  maximum time:     37.455 ms (91.60% GC)
  --------------
  samples:          1736
  evals/sample:     1

In [74]:
function sum_matrix_inbounds(x::Array{Complex{Float64},2},y::Array{Complex{Float64},3})
    s = zeros(ComplexF64,39,39)
    @inbounds for i = 1:36
        s = s + (y[i,:,:]*x)*y[i,:,:]
    end
end

sum_matrix_inbounds (generic function with 1 method)

In [75]:
@benchmark sum_matrix_inbounds(x,y)

BenchmarkTools.Trial: 
  memory estimate:  4.22 MiB
  allocs estimate:  362
  --------------
  minimum time:     1.633 ms (0.00% GC)
  median time:      2.918 ms (0.00% GC)
  mean time:        2.870 ms (12.48% GC)
  maximum time:     44.947 ms (93.50% GC)
  --------------
  samples:          1739
  evals/sample:     1

In [72]:
function sum_matrix_inbounds_sim(x::Array{Complex{Float64},2},y::Array{Complex{Float64},3})
    s = zeros(ComplexF64,39,39)
    @inbounds @simd for i = 1:36
        s = s + (y[i,:,:]*x)*y[i,:,:]
    end
end

sum_matrix_inbounds_sim (generic function with 1 method)

In [73]:
@benchmark sum_matrix_inbounds_sim(x,y)

BenchmarkTools.Trial: 
  memory estimate:  4.22 MiB
  allocs estimate:  362
  --------------
  minimum time:     1.629 ms (0.00% GC)
  median time:      2.898 ms (0.00% GC)
  mean time:        2.865 ms (12.70% GC)
  maximum time:     42.250 ms (93.28% GC)
  --------------
  samples:          1742
  evals/sample:     1

In [76]:
function sum_matrix_threads(x::Array{Complex{Float64},2},y::Array{Complex{Float64},3})
    s = zeros(ComplexF64,39,39)
    Threads.@threads for i = 1:36
        s = s + (y[i,:,:]*x)*y[i,:,:]
    end
    s
end

sum_matrix_threads (generic function with 2 methods)

In [77]:
sum_matrix_threads(x,y);

In [78]:
@benchmark sum_matrix_threads(x,y)

BenchmarkTools.Trial: 
  memory estimate:  4.23 MiB
  allocs estimate:  396
  --------------
  minimum time:     557.800 μs (0.00% GC)
  median time:      1.872 ms (0.00% GC)
  mean time:        46.311 ms (96.15% GC)
  maximum time:     19.505 s (99.99% GC)
  --------------
  samples:          438
  evals/sample:     1

In [79]:
@benchmark sum_matrix_threads(x,y)

BenchmarkTools.Trial: 
  memory estimate:  4.23 MiB
  allocs estimate:  396
  --------------
  minimum time:     546.800 μs (0.00% GC)
  median time:      1.849 ms (0.00% GC)
  mean time:        47.159 ms (96.24% GC)
  maximum time:     20.153 s (99.99% GC)
  --------------
  samples:          444
  evals/sample:     1

In [92]:
@time sum_matrix_threads(x,y);

  0.001335 seconds (399 allocations: 4.228 MiB)


In [93]:
@time sum_matrix_threads(x,y);

  0.001399 seconds (397 allocations: 4.228 MiB)


In [94]:
@time sum_matrix_inbounds_sim(x,y)

  0.002848 seconds (362 allocations: 4.223 MiB)
