https://github.com/mitmath/18S096/blob/master/lectures/lecture1/Performance-variation.ipynb
https://github.com/mitmath/18S096/blob/master/lectures/lecture1/Boxes-and-registers.ipynb

In [1]:
using BenchmarkTools
using Cxx
using PyCall

In [2]:
# ten million random Float64 samples; every benchmark below sums this same vector
x = rand(10_000_000);
# results table: benchmark label => measurement (untyped, i.e. Dict{Any,Any})
d = Dict{Any,Any}()

Dict{Any,Any} with 0 entries

## Hand-written C

In [51]:
# C source for a plain left-to-right summation loop over `n` doubles.
# It is kept as a Julia string so that it can be piped to a C compiler.
c_code = """
#include <stddef.h>
double c_sum(size_t n, double *X) {
    double s = 0.0;
    for (size_t i = 0; i < n; ++i) {
        s += X[i];
    }
    return s;
}
"""

"#include <stddef.h>\ndouble c_sum(size_t n, double *X) {\n    double s = 0.0;\n    for (size_t i = 0; i < n; ++i) {\n        s += X[i];\n    }\n    return s;\n}\n"

In [52]:
# compile to a shared library by piping c_code to gcc:
# (only works if you have gcc installed)
# Clib is the library path WITHOUT the platform extension; `const` so that it
# can be used as a constant library name in `ccall`.
const Clib = tempname()
using Libdl



In [53]:
# Run gcc with the C source supplied on its standard input:
#   -xc and the trailing `-`  : read C source from stdin
#   -fPIC -shared             : build a position-independent shared library
#   -O3 -msse3                : optimize, allowing SSE3 instructions
# The output file is Clib plus the platform's shared-library extension.
open(`gcc -fPIC -O3 -msse3 -xc -shared -o $(Clib * "." * Libdl.dlext) -`, "w") do gcc_stdin
    print(gcc_stdin, c_code)
end

In [54]:
# Julia wrapper for the compiled C function `c_sum` in the library at Clib.
# The array is passed as (element count, pointer to its Float64 data) and the
# C `double` result comes back as a Float64.
function c_sum(X::Array{Float64})
    n = length(X)
    return ccall(("c_sum", Clib), Float64, (Csize_t, Ptr{Float64}), n, X)
end

c_sum (generic function with 1 method)

In [55]:
# sanity check: the C result must agree with Julia's built-in `sum` up to
# floating-point rounding (`≈` is `isapprox`), since the summation order may differ
c_sum(x) ≈ sum(x)

true

In [56]:
# `$x` interpolates the global array into the benchmark expression so that
# global-variable access is not part of the measurement
b = @benchmark c_sum($x)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     10.593 ms (0.00% GC)
  median time:      11.498 ms (0.00% GC)
  mean time:        11.598 ms (0.00% GC)
  maximum time:     14.854 ms (0.00% GC)
  --------------
  samples:          431
  evals/sample:     1

In [12]:
# keep the fastest sample; BenchmarkTools records times in ns, so / 1e6 gives ms
d["C"] = minimum(b.times) / 1e6

10.654601

## Hand-written C++

In [13]:
# The same summation loop as a C++ function, handed to Cxx.jl through the
# `cxx"""..."""` string macro so that it can be called with `@cxx`.
cpp_code = cxx"""
double sum_array(size_t n, double *X) {
    double s = 0.0;
    for (size_t i = 0; i < n; ++i) {
        s += X[i];
    }
    return s;
}
"""

true

In [14]:
# Julia wrapper for the C++ function `sum_array`: forwards the element count
# and a raw pointer to the data of `xs` via Cxx.jl's `@cxx` call macro.
function cpp_sum(xs)
    return @cxx sum_array(length(xs), pointer(xs))
end

cpp_sum (generic function with 1 method)

In [15]:
# time the C++ version on the same interpolated input array
b = @benchmark cpp_sum($x)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     24.616 ms (0.00% GC)
  median time:      25.828 ms (0.00% GC)
  mean time:        26.262 ms (0.00% GC)
  maximum time:     49.594 ms (0.00% GC)
  --------------
  samples:          191
  evals/sample:     1

In [16]:
# fastest C++ sample, converted from nanoseconds to milliseconds
d["C++"] = minimum(b.times) / 1e6

24.6163

## Built-in Python / numpy `sum`

In [17]:
# call a low-level PyCall function to get a Python list, because
# by default PyCall will convert to a NumPy array instead (we benchmark NumPy below):
xpy_list = PyCall.array2py(x)
# get the Python built-in "sum" function:
pysum = pybuiltin("sum")

# Interpolate BOTH globals with `$`: `pysum` is a non-constant global just like
# `xpy_list`, and leaving it uninterpolated would add untyped-global lookup and
# dynamic dispatch to the measured time (the NumPy and hand-written Python
# benchmarks interpolate their callee in the same way).
b = @benchmark $pysum($xpy_list)

BenchmarkTools.Trial: 
  memory estimate:  48 bytes
  allocs estimate:  3
  --------------
  minimum time:     35.040 ms (0.00% GC)
  median time:      36.516 ms (0.00% GC)
  mean time:        37.020 ms (0.00% GC)
  maximum time:     49.330 ms (0.00% GC)
  --------------
  samples:          135
  evals/sample:     1

In [18]:
# fastest sample of Python's built-in sum, converted from ns to ms
d["Python (built-in)"] = minimum(b.times) / 1e6

35.0397

In [19]:
# NumPy's `sum` function, looked up on the imported `numpy` module
numpy_sum = pyimport("numpy").sum
xpy_numpy = PyObject(x) # converts to a numpy array by default
# both the Python callable and its argument are globals, so both are interpolated
b = @benchmark $numpy_sum($xpy_numpy)

BenchmarkTools.Trial: 
  memory estimate:  48 bytes
  allocs estimate:  3
  --------------
  minimum time:     9.787 ms (0.00% GC)
  median time:      10.802 ms (0.00% GC)
  mean time:        11.067 ms (0.00% GC)
  maximum time:     22.822 ms (0.00% GC)
  --------------
  samples:          452
  evals/sample:     1

In [20]:
# fastest NumPy sample, converted from nanoseconds to milliseconds
d["Python (numpy)"] = minimum(b.times) / 1e6

9.7873

## Hand-written Python

In [21]:
# Define a pure-Python summation loop in PyCall's Python namespace ...
py"""
def mysum(a):
    s = 0.0
    for x in a:
        s = s + x
    return s
"""
# ... and fetch it back as a PyObject that can be called from Julia
mysum_py = py"mysum"

PyObject <function mysum at 0x00000000390BEBF8>

In [22]:
# time the hand-written Python loop on the Python list (not the NumPy array)
b = @benchmark $mysum_py($xpy_list)

BenchmarkTools.Trial: 
  memory estimate:  48 bytes
  allocs estimate:  3
  --------------
  minimum time:     263.458 ms (0.00% GC)
  median time:      272.664 ms (0.00% GC)
  mean time:        274.377 ms (0.00% GC)
  maximum time:     285.621 ms (0.00% GC)
  --------------
  samples:          19
  evals/sample:     1

In [23]:
# fastest sample of the hand-written Python loop, converted from ns to ms
d["Python (hand-written)"] = minimum(b.times) / 1e6

263.4577

In [24]:
# @btime $mysum_py($xpy_numpy)

## Built-in Julia `sum`

In [25]:
# time Julia's built-in `sum` on the Float64 vector
b = @benchmark sum($x)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     3.712 ms (0.00% GC)
  median time:      3.945 ms (0.00% GC)
  mean time:        4.151 ms (0.00% GC)
  maximum time:     7.600 ms (0.00% GC)
  --------------
  samples:          1202
  evals/sample:     1

In [26]:
# fastest sample of the built-in `sum`, converted from ns to ms
d["Julia (built-in)"] = minimum(b.times) / 1e6

3.7117

In [27]:
# Same values as `x`, but held in a Vector{Any}, so the container no longer
# declares a concrete element type. This result is only displayed for
# comparison; it is not stored in `d`.
x_any = Vector{Any}(x)
@benchmark sum($x_any)

BenchmarkTools.Trial: 
  memory estimate:  152.59 MiB
  allocs estimate:  9999999
  --------------
  minimum time:     202.654 ms (0.00% GC)
  median time:      219.493 ms (8.88% GC)
  mean time:        220.798 ms (7.07% GC)
  maximum time:     250.528 ms (8.91% GC)
  --------------
  samples:          23
  evals/sample:     1

## Hand-written Julia

In [28]:
# Naive summation: add up the elements of `A` one at a time, in iteration order.
# Works for any iterable whose `eltype` has a `zero`; returns that zero when
# `A` is empty.
function mysum1(A)
    # start from a zero of A's element type so that, for concrete element
    # types, the accumulator keeps one type for the whole loop
    total = zero(eltype(A))
    for elem in A
        total = total + elem
    end
    return total
end

mysum1 (generic function with 1 method)

In [29]:
# time the plain hand-written Julia loop
b = @benchmark mysum1($x)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     10.640 ms (0.00% GC)
  median time:      11.616 ms (0.00% GC)
  mean time:        11.845 ms (0.00% GC)
  maximum time:     19.081 ms (0.00% GC)
  --------------
  samples:          422
  evals/sample:     1

In [30]:
# fastest sample of the plain Julia loop, converted from ns to ms
d["Julia (hand-written)"] = minimum(b.times) / 1e6

10.64

In [31]:
# Summation loop annotated with `@simd`, which permits the compiler to reorder
# the floating-point additions so that the loop can be vectorized. The result
# may therefore differ from a strict left-to-right sum in the last few bits.
function mysum(A)
    # accumulator starts as a zero of A's element type
    acc = zero(eltype(A))
    @simd for v in A
        acc += v
    end
    return acc
end

mysum (generic function with 1 method)

In [32]:
# time the `@simd` version of the hand-written Julia loop
b = @benchmark mysum($x)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     3.737 ms (0.00% GC)
  median time:      4.113 ms (0.00% GC)
  mean time:        4.302 ms (0.00% GC)
  maximum time:     7.816 ms (0.00% GC)
  --------------
  samples:          1160
  evals/sample:     1

In [33]:
# fastest sample of the `@simd` loop, converted from ns to ms
d["Julia (hand-written, simd)"] = minimum(b.times) / 1e6

3.7366

## Summary

In [34]:
# Print one row per stored measurement, fastest first: the label is padded on
# the right with dots to 30 columns, and the time (rounded to 2 digits) is
# padded on the left with dots to 10 columns.
for (label, ms) in sort(collect(d), by=last)
    println(rpad(label, 30, "."), lpad(round(ms, digits=2), 10, "."))
end

Julia (built-in)....................3.71
Julia (hand-written, simd)..........3.74
Python (numpy)......................9.79
Julia (hand-written)...............10.64
C..................................10.65
C++................................24.62
Python (built-in)..................35.04
Python (hand-written).............263.46
