In [7]:
using BenchmarkTools
using Unrolled

using SIMD

using ChipSort

In [8]:
"""
    run_bench_mine(T, chunks, Val(N), Val(L))

Benchmark `chipsort_medium` on a freshly generated random vector of
`chunks * N * L` elements of type `T`, forwarding `Val(N)` and `Val(L)` to it.
Returns the result of `@benchmark`.
"""
function run_bench_mine(T, chunks, ::Val{N}, ::Val{L}) where {N, L}
    len = chunks * N * L
    input = rand(T, len)
    # `$` interpolates the local values into the benchmarked expression.
    return @benchmark chipsort_medium($input, Val($N), Val($L))
end

"""
    run_bench_base(T, chunks, Val(N), Val(L))

Benchmark Base `sort` on a freshly generated random vector of
`chunks * N * L` elements of type `T`. `N` and `L` are only used to size
the input. Returns the result of `@benchmark`.
"""
function run_bench_base(T, chunks, ::Val{N}, ::Val{L}) where {N, L}
    len = chunks * N * L
    input = rand(T, len)
    # `$` interpolates the local vector into the benchmarked expression.
    return @benchmark sort($input)
end

run_bench_base (generic function with 1 method)

In [None]:
"""
    ExpRun(code, eltype, data_size, vec_size, vec_count, bench)

Record of one benchmark experiment.

Fields are deliberately untyped so the constructor accepts whatever the
caller passes. `bench` only needs a `times` property that supports integer
indexing (that is all `show` reads from it).
"""
struct ExpRun
    code        # label of the implementation that was measured (e.g. `:chip`, `:juli`)
    eltype      # element type of the sorted data
    data_size   # first size factor of the input (the chunk count in `run_tests`)
    vec_size    # second size factor (`N` in `run_tests`)
    vec_count   # third size factor (`L` in `run_tests`)
    bench       # benchmark result; must expose `.times`
end

"""
    show(io::IO, ee::ExpRun)

Write a one-line, space-separated summary of `ee` to `io`:
`code eltype data_size vec_size vec_count time`, where `time` is the entry at
position `div(length, 2)` of `ee.bench.times` (the first entry when only a
single sample is available).
"""
function Base.show(io::IO, ee::ExpRun)
    times = ee.bench.times
    # `div(end, 2)` is 0 for a single-sample run; clamp to the first valid index
    # so such runs print instead of throwing a BoundsError.
    mid = max(div(lastindex(times), 2), firstindex(times))
    # Everything goes to `io` (not stdout) so `sprint`, `string`, and
    # redirected output all see the text.
    print(io, ee.code, " ", ee.eltype, " ", ee.data_size, " ",
          ee.vec_size, " ", ee.vec_count, " ", times[mid])
    return nothing
end

# Run both benchmarks (ChipSort via `run_bench_mine`, Base `sort` via
# `run_bench_base`) for every combination of the entries of `data_size`, `nn`
# and `ll`, on `Float32` data. Each result is printed as soon as it is
# available and also collected; the collected `ExpRun` vector is returned.
# Order per combination: the `:chip` run first, then the `:juli` run.
@unroll function run_tests(data_size, nn, ll)
    results = ExpRun[]
    elty = Float32

    @unroll for chunks in data_size
        @unroll for vsize in nn
            @unroll for vcount in ll
                # ChipSort measurement for this configuration.
                chip_stat = run_bench_mine(elty, chunks, Val(vsize), Val(vcount))
                chip_run = ExpRun(:chip, elty, chunks, vsize, vcount, chip_stat)
                println(chip_run)
                push!(results, chip_run)

                # Base `sort` measurement on an input of the same size.
                base_stat = run_bench_base(elty, chunks, Val(vsize), Val(vcount))
                base_run = ExpRun(:juli, elty, chunks, vsize, vcount, base_stat)
                println(base_run)
                push!(results, base_run)
            end
        end
    end

    return results
end

# Parameter grid for the experiment; all three are passed as tuples to the
# `@unroll`ed `run_tests`.
chunk_sizes = (16, 8, 4, 2)
# `N` and `L` sweep the same set of values.
N = L = (8, 4, 2)

# Run every combination and keep the collected results.
data = run_tests(chunk_sizes, N, L)

chip Float32 16 8 8 80679.0
juli Float32 16 8 8 20025.0
chip Float32 16 8 4 59059.0
juli Float32 16 8 4 4156.285714285715
chip Float32 16 8 2 52476.0
juli Float32 16 8 2 2052.5
chip Float32 16 4 8 58692.0
juli Float32 16 4 8 4275.571428571428
chip Float32 16 4 4 50782.0
juli Float32 16 4 4 2079.6
chip Float32 16 4 2 49593.0
juli Float32 16 4 2 837.9230769230769
chip Float32 16 2 8 55177.0
juli Float32 16 2 8 2096.6
chip Float32 16 2 4 49431.0
juli Float32 16 2 4 862.2608695652174
chip Float32 16 2 2 48473.0
juli Float32 16 2 2 371.9178743961353
chip Float32 8 8 8 27685.0
juli Float32 8 8 8 4318.285714285715
chip Float32 8 8 4 23871.0
juli Float32 8 8 4 2072.6
chip Float32 8 8 2 24108.0
juli Float32 8 8 2 847.9855072463768
chip Float32 8 4 8 23594.0
juli Float32 8 4 8 2049.9
chip Float32 8 4 4 22674.0
juli Float32 8 4 4 832.2278481012659
chip Float32 8 4 2 24909.0
