In [1]:
using BenchmarkTools
using Unrolled

using SIMD

using ChipSort

In [2]:
function run_bench_mine(T, ::Val{C}, ::Val{N}, ::Val{L}) where {C, N, L}
    data = rand(T, C*N*L)
    stat = @benchmark chipsort_medium($data, Val($C), Val($N), Val($L))
    stat
end

function run_bench_base(T, ::Val{C}, ::Val{N}, ::Val{L}) where {C, N, L}
    data = rand(T, C*N*L)
    stat = @benchmark sort($data)
    stat
end

run_bench_base (generic function with 1 method)

In [3]:
struct ExpRun
    code
    eltype
    data_size
    vec_size
    vec_count
    bench
end

function Base.show(io::IO, ee::ExpRun)
    print(ee.code, " ")
    print(ee.eltype, " ")
    print(ee.data_size, " ")
    print(ee.vec_size, " ")
    print(ee.vec_count, " ")
    print(ee.bench.times[div(end,2)])
end

@unroll function run_tests(cc, nn)
    exps = ExpRun[]
    T = Int32
    
    @unroll for c in cc
        @unroll for n in nn
            tt = run_bench_mine(T, Val(c), Val(n), Val(n))
            ee = ExpRun(:chip, T, c*n*n, n, n, tt)
            println(ee)
            push!(exps, ee)
            tt = run_bench_base(T, Val(c), Val(n), Val(n))
            ee = ExpRun(:juli, T, c*n*n, n, n, tt)
            println(ee)
            push!(exps, ee)
        end
    end
    exps
end

C = (16,8,4,2)
N = (8,4,2)

data = run_tests(C, N)

chip Int32 1024 8 8 68140.0
juli Int32 1024 8 8 20487.0
chip Int32 256 4 4 42045.0
juli Int32 256 4 4 1692.0
chip Int32 64 2 2 38615.0
juli Int32 64 2 2 336.0765765765766
chip Int32 512 8 8 23428.0
juli Int32 512 8 8 3786.75
chip Int32 128 4 4 18717.0
juli Int32 128 4 4 688.0326797385621
chip Int32 32 2 2 19788.0
juli Int32 32 2 2 170.83428571428573
chip Int32 256 8 8 16077.0
juli Int32 256 8 8 1675.9
chip Int32 64 4 4 12671.0
juli Int32 64 4 4 319.1525423728813
chip Int32 16 2 2 12064.0
juli Int32 16 2 2 110.62679955703211
chip Int32 128 8 8 11845.0
juli Int32 128 8 8 694.3525641025641
chip Int32 32 4 4 10792.0
juli Int32 32 4 4 172.865
chip Int32 8 2 2 11174.0
juli Int32 8 2 2 61.907120743034056
chip 

24-element Array{ExpRun,1}:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 

Int32 1024 8 8 68140.0juli Int32 1024 8 8 20487.0chip Int32 256 4 4 42045.0juli Int32 256 4 4 1692.0chip Int32 64 2 2 38615.0juli Int32 64 2 2 336.0765765765766chip Int32 512 8 8 23428.0juli Int32 512 8 8 3786.75chip Int32 128 4 4 18717.0juli Int32 128 4 4 688.0326797385621chip Int32 32 2 2 19788.0juli Int32 32 2 2 170.83428571428573chip Int32 256 8 8 16077.0juli Int32 256 8 8 1675.9chip Int32 64 4 4 12671.0juli Int32 64 4 4 319.1525423728813chip Int32 16 2 2 12064.0juli Int32 16 2 2 110.62679955703211chip Int32 128 8 8 11845.0juli Int32 128 8 8 694.3525641025641chip Int32 32 4 4 10792.0juli Int32 32 4 4 172.865chip Int32 8 2 2 11174.0juli Int32 8 2 2 61.907120743034056chip Int32 1024 8 8 68140.0chip Int32 1024 8 8 68140.0juli Int32 1024 8 8 20487.0juli Int32 1024 8 8 20487.0chip Int32 256 4 4 42045.0chip Int32 256 4 4 42045.0juli Int32 256 4 4 1692.0juli Int32 256 4 4 1692.0chip Int32 64 2 2 38615.0chip Int32 64 2 2 38615.0juli Int32 64 2 2 336.0765765765766juli Int32 64 2 2 336.07657

In [7]:
# using Profile
# Profile.init(delay = 0.0001)
# function run_many(data,n)
#     for i in 1:n
#         chipsort_medium(data, Val(8), Val(8)) 
#     end
# end

# data = rand(Int32, 2^10)
# @profile run_many(data, 1)