# Optimizations

https://docs.julialang.org/en/v1/manual/performance-tips/

## Setup

In [16]:
using Pkg

In [19]:
pkg"add BenchmarkTools"

[32m[1m Resolving[22m[39m package versions...
[32m[1m  Updating[22m[39m `/opt/julia/environments/v1.3/Project.toml`
[90m [no changes][39m
[32m[1m  Updating[22m[39m `/opt/julia/environments/v1.3/Manifest.toml`
[90m [no changes][39m


In [20]:
pkg"add PyCall"

[32m[1m Resolving[22m[39m package versions...
[32m[1m Installed[22m[39m PyCall ───────── v1.91.2
[32m[1m Installed[22m[39m MacroTools ───── v0.5.3
[32m[1m Installed[22m[39m DataStructures ─ v0.17.9
[32m[1m  Updating[22m[39m `/opt/julia/environments/v1.3/Project.toml`
 [90m [438e738f][39m[92m + PyCall v1.91.2[39m
[32m[1m  Updating[22m[39m `/opt/julia/environments/v1.3/Manifest.toml`
 [90m [864edb3b][39m[92m + DataStructures v0.17.9[39m
 [90m [1914dd2f][39m[92m + MacroTools v0.5.3[39m
 [90m [438e738f][39m[92m + PyCall v1.91.2[39m
[32m[1m  Building[22m[39m PyCall → `/opt/julia/packages/PyCall/ttONZ/deps/build.log`


In [21]:
pkg"update"

[32m[1m  Updating[22m[39m registry at `/opt/julia/registries/General`
[32m[1m  Updating[22m[39m git-repo `https://github.com/JuliaRegistries/General.git`
[32m[1m  Updating[22m[39m `/opt/julia/environments/v1.3/Project.toml`
[90m [no changes][39m
[32m[1m  Updating[22m[39m `/opt/julia/environments/v1.3/Manifest.toml`
[90m [no changes][39m


In [22]:
pkg"precompile"

[32m[1mPrecompiling[22m[39m project...
[32m[1mPrecompiling[22m[39m PyCall


┌ Info: Precompiling PyCall [438e738f-606a-5dbb-bf0a-cddfbfd45ab0]
└ @ Base loading.jl:1273


## Type Stability

In [23]:
using BenchmarkTools

### Basics

#### Global Variables

In [2]:
a = 2
function plusmulta_bad(n)
    res = 0
    for i=1:n
        res += i*a
    end
    res
end     

plusmulta_bad (generic function with 1 method)

In [3]:
@btime plusmulta_bad(1_000_000)

  108.224 ms (2999212 allocations: 45.76 MiB)


1000001000000

In [4]:
@code_warntype plusmulta_bad(1_000_000)

Variables
  #self#[36m::Core.Compiler.Const(plusmulta_bad, false)[39m
  n[36m::Int64[39m
  res[91m[1m::Any[22m[39m
  @_4[33m[1m::Union{Nothing, Tuple{Int64,Int64}}[22m[39m
  i[36m::Int64[39m

Body[91m[1m::Any[22m[39m
[90m1 ─[39m       (res = 0)
[90m│  [39m %2  = (1:n)[36m::Core.Compiler.PartialStruct(UnitRange{Int64}, Any[Core.Compiler.Const(1, false), Int64])[39m
[90m│  [39m       (@_4 = Base.iterate(%2))
[90m│  [39m %4  = (@_4 === nothing)[36m::Bool[39m
[90m│  [39m %5  = Base.not_int(%4)[36m::Bool[39m
[90m└──[39m       goto #4 if not %5
[90m2 ┄[39m %7  = @_4::Tuple{Int64,Int64}[36m::Tuple{Int64,Int64}[39m
[90m│  [39m       (i = Core.getfield(%7, 1))
[90m│  [39m %9  = Core.getfield(%7, 2)[36m::Int64[39m
[90m│  [39m %10 = res[91m[1m::Any[22m[39m
[90m│  [39m %11 = (i * Main.a)[91m[1m::Any[22m[39m
[90m│  [39m       (res = %10 + %11)
[90m│  [39m       (@_4 = Base.iterate(%2, %9))
[90m│  [39m %14 = (@_4 === nothing)[36m::Bool

This function is really slow because the type of the global variable *a* is not fixed.

In [5]:
const a2 = 2
function plusmulta_good(n)
    res = 0
    for i=1:n
        res += i*a2
    end
    res
end     

plusmulta_good (generic function with 1 method)

Solution 1: make the global variable a constant.

In [6]:
@btime plusmulta_good(1_000_000)

  1.736 ns (0 allocations: 0 bytes)


1000001000000

In [7]:
@code_warntype plusmulta_good(1_000_000)

Variables
  #self#[36m::Core.Compiler.Const(plusmulta_good, false)[39m
  n[36m::Int64[39m
  res[36m::Int64[39m
  @_4[33m[1m::Union{Nothing, Tuple{Int64,Int64}}[22m[39m
  i[36m::Int64[39m

Body[36m::Int64[39m
[90m1 ─[39m       (res = 0)
[90m│  [39m %2  = (1:n)[36m::Core.Compiler.PartialStruct(UnitRange{Int64}, Any[Core.Compiler.Const(1, false), Int64])[39m
[90m│  [39m       (@_4 = Base.iterate(%2))
[90m│  [39m %4  = (@_4 === nothing)[36m::Bool[39m
[90m│  [39m %5  = Base.not_int(%4)[36m::Bool[39m
[90m└──[39m       goto #4 if not %5
[90m2 ┄[39m %7  = @_4::Tuple{Int64,Int64}[36m::Tuple{Int64,Int64}[39m
[90m│  [39m       (i = Core.getfield(%7, 1))
[90m│  [39m %9  = Core.getfield(%7, 2)[36m::Int64[39m
[90m│  [39m %10 = res[36m::Int64[39m
[90m│  [39m %11 = (i * Main.a2)[36m::Int64[39m
[90m│  [39m       (res = %10 + %11)
[90m│  [39m       (@_4 = Base.iterate(%2, %9))
[90m│  [39m %14 = (@_4 === nothing)[36m::Bool[39m
[90m│  [39m %15 = 

In [8]:
@code_llvm plusmulta_good(1_000_000)


;  @ In[5]:3 within `plusmulta_good'
define i64 @julia_plusmulta_good_16640(i64) {
top:
;  @ In[5]:4 within `plusmulta_good'
; ┌ @ range.jl:5 within `Colon'
; │┌ @ range.jl:275 within `Type'
; ││┌ @ range.jl:280 within `unitrange_last'
; │││┌ @ operators.jl:341 within `>='
; ││││┌ @ int.jl:424 within `<='
       %1 = icmp sgt i64 %0, 0
; └└└└└
  br i1 %1, label %L7.L12_crit_edge, label %L29

L7.L12_crit_edge:                                 ; preds = %top
  %2 = shl i64 %0, 2
  %3 = add nsw i64 %0, -1
  %4 = add nsw i64 %0, -2
  %5 = mul i64 %3, %4
  %6 = and i64 %5, -2
  %7 = add i64 %2, %6
  %8 = add i64 %7, -2
;  @ In[5]:7 within `plusmulta_good'
  br label %L29

L29:                                              ; preds = %L7.L12_crit_edge, %top
  %value_phi9 = phi i64 [ 0, %top ], [ %8, %L7.L12_crit_edge ]
  ret i64 %value_phi9
}


Actually, the compiler optimized the for-loop away.

In [9]:
function plusmulta_good(n, a)
    res = 0
    for i=1:n
        res += i*a
    end
    res
end     

plusmulta_good (generic function with 2 methods)

In [10]:
@btime plusmulta_good(1_000_000, 2)

  1.736 ns (0 allocations: 0 bytes)


1000001000000

In [11]:
@code_llvm plusmulta_good(1_000_000, 2)


;  @ In[9]:2 within `plusmulta_good'
define i64 @julia_plusmulta_good_16655(i64, i64) {
top:
;  @ In[9]:3 within `plusmulta_good'
; ┌ @ range.jl:5 within `Colon'
; │┌ @ range.jl:275 within `Type'
; ││┌ @ range.jl:280 within `unitrange_last'
; │││┌ @ operators.jl:341 within `>='
; ││││┌ @ int.jl:424 within `<='
       %2 = icmp sgt i64 %0, 0
; └└└└└
  br i1 %2, label %L7.L12_crit_edge, label %L28

L7.L12_crit_edge:                                 ; preds = %top
  %3 = shl nuw i64 %0, 1
  %4 = add nsw i64 %0, -1
  %5 = zext i64 %4 to i65
  %6 = add nsw i64 %0, -2
  %7 = zext i64 %6 to i65
  %8 = mul i65 %5, %7
  %9 = lshr i65 %8, 1
  %10 = trunc i65 %9 to i64
  %11 = add i64 %3, %10
  %12 = add i64 %11, -1
  %13 = mul i64 %12, %1
;  @ In[9]:6 within `plusmulta_good'
  br label %L28

L28:                                              ; preds = %L7.L12_crit_edge, %top
  %value_phi9 = phi i64 [ 0, %top ], [ %13, %L7.L12_crit_edge ]
  ret i64 %value_phi9
}


We got the same good performance when using *a* as a method parameter.

Note that both methods (without and with *a* as parameter) are defined for the same function in this example, the concrete method is chosen according to call signature using multiple dispatch.

In [12]:
a = 2
randsum_bad(n) = begin # a rather unusual (and not recommended) way to define a function...
    res = 0
    for i = 1:n
        res += a*rand()
    end
    res
end

randsum_bad (generic function with 1 method)

In [13]:
@btime randsum_bad(1_000)

  92.300 μs (3000 allocations: 46.88 KiB)


1003.2320658465163

In [14]:
function randsum_better(n, a)
    res = 0
    for i = 1:n
        res += a*rand()
    end
    res
end

randsum_better (generic function with 1 method)

In [15]:
@btime randsum_better(1_000, $a)

  4.974 μs (0 allocations: 0 bytes)


988.3037837966322

This is a more "fair" comparison because the compiler cannot optimize the loop away. Still, type stability gives a performance improvement of a factor of 20.

#### Type Stability Inside Methods

Can we get better?

In [16]:
@code_warntype randsum_better(1_000, a)

Variables
  #self#[36m::Core.Compiler.Const(randsum_better, false)[39m
  n[36m::Int64[39m
  a[36m::Int64[39m
  res[91m[1m::Union{Float64, Int64}[22m[39m
  @_5[33m[1m::Union{Nothing, Tuple{Int64,Int64}}[22m[39m
  i[36m::Int64[39m

Body[91m[1m::Union{Float64, Int64}[22m[39m
[90m1 ─[39m       (res = 0)
[90m│  [39m %2  = (1:n)[36m::Core.Compiler.PartialStruct(UnitRange{Int64}, Any[Core.Compiler.Const(1, false), Int64])[39m
[90m│  [39m       (@_5 = Base.iterate(%2))
[90m│  [39m %4  = (@_5 === nothing)[36m::Bool[39m
[90m│  [39m %5  = Base.not_int(%4)[36m::Bool[39m
[90m└──[39m       goto #4 if not %5
[90m2 ┄[39m %7  = @_5::Tuple{Int64,Int64}[36m::Tuple{Int64,Int64}[39m
[90m│  [39m       (i = Core.getfield(%7, 1))
[90m│  [39m %9  = Core.getfield(%7, 2)[36m::Int64[39m
[90m│  [39m %10 = res[91m[1m::Union{Float64, Int64}[22m[39m
[90m│  [39m %11 = Main.rand()[36m::Float64[39m
[90m│  [39m %12 = (a * %11)[36m::Float64[39m
[90m│  [39m   

The variable *res* is still not type-stable. It is defined as integer, but the added random numbers are float.

Let's fix this:

In [17]:
function randsum_good(n, a)
    res = 0. # note the . which makes this a Float64 number
    for i = 1:n
        res += a*rand()
    end
    res
end

randsum_good (generic function with 1 method)

In [18]:
@code_warntype randsum_good(1_000, a)

Variables
  #self#[36m::Core.Compiler.Const(randsum_good, false)[39m
  n[36m::Int64[39m
  a[36m::Int64[39m
  res[36m::Float64[39m
  @_5[33m[1m::Union{Nothing, Tuple{Int64,Int64}}[22m[39m
  i[36m::Int64[39m

Body[36m::Float64[39m
[90m1 ─[39m       (res = 0.0)
[90m│  [39m %2  = (1:n)[36m::Core.Compiler.PartialStruct(UnitRange{Int64}, Any[Core.Compiler.Const(1, false), Int64])[39m
[90m│  [39m       (@_5 = Base.iterate(%2))
[90m│  [39m %4  = (@_5 === nothing)[36m::Bool[39m
[90m│  [39m %5  = Base.not_int(%4)[36m::Bool[39m
[90m└──[39m       goto #4 if not %5
[90m2 ┄[39m %7  = @_5::Tuple{Int64,Int64}[36m::Tuple{Int64,Int64}[39m
[90m│  [39m       (i = Core.getfield(%7, 1))
[90m│  [39m %9  = Core.getfield(%7, 2)[36m::Int64[39m
[90m│  [39m %10 = res[36m::Float64[39m
[90m│  [39m %11 = Main.rand()[36m::Float64[39m
[90m│  [39m %12 = (a * %11)[36m::Float64[39m
[90m│  [39m       (res = %10 + %12)
[90m│  [39m       (@_5 = Base.iterate(%2, %9)

In [19]:
@btime randsum_good(1_000, $a)

  3.621 μs (0 allocations: 0 bytes)


1039.1442226032727

An additional improvement of 30%.

### Custom Data Structures

In [20]:
abstract type MyDataTypes end

In [21]:
function fill_data!(data_array:: AbstractArray{T, 1}) where {T <: MyDataTypes}
    for i = 1:length(data_array)
        data = T(i, rand())
        data_array[i] = data
    end
end 

fill_data! (generic function with 1 method)

In [22]:
function aggregate_data(data_array:: AbstractArray{T, 1}) where {T <: MyDataTypes}
    res = zero(data_array[1].id * data_array[1].value)
    for i in eachindex(data_array)
        @inbounds row = data_array[i]
        res += row.id * row.value
    end
    res
end     

aggregate_data (generic function with 1 method)

#### Bad - Using of Abstract Types in Structures

In [23]:
struct MyBadData <: MyDataTypes
    id:: Integer
    value:: AbstractFloat
end

In [24]:
data_array_bad = Array{MyBadData, 1}(undef, 1_000)
@btime fill_data!(data_array_bad)

  34.172 μs (2489 allocations: 54.52 KiB)


In [25]:
@btime aggregate_data(data_array_bad)

  101.220 μs (2001 allocations: 31.27 KiB)


250448.92119289367

In [26]:
@code_warntype aggregate_data(data_array_bad)

Variables
  #self#[36m::Core.Compiler.Const(aggregate_data, false)[39m
  data_array[36m::Array{MyBadData,1}[39m
  res[91m[1m::Any[22m[39m
  @_4[33m[1m::Union{Nothing, Tuple{Int64,Int64}}[22m[39m
  i[36m::Int64[39m
  val[36m::MyBadData[39m
  row[36m::MyBadData[39m

Body[91m[1m::Any[22m[39m
[90m1 ─[39m %1  = Base.getindex(data_array, 1)[36m::MyBadData[39m
[90m│  [39m %2  = Base.getproperty(%1, :id)[91m[1m::Integer[22m[39m
[90m│  [39m %3  = Base.getindex(data_array, 1)[36m::MyBadData[39m
[90m│  [39m %4  = Base.getproperty(%3, :value)[91m[1m::AbstractFloat[22m[39m
[90m│  [39m %5  = (%2 * %4)[91m[1m::Any[22m[39m
[90m│  [39m       (res = Main.zero(%5))
[90m│  [39m %7  = Main.eachindex(data_array)[36m::Base.OneTo{Int64}[39m
[90m│  [39m       (@_4 = Base.iterate(%7))
[90m│  [39m %9  = (@_4 === nothing)[36m::Bool[39m
[90m│  [39m %10 = Base.not_int(%9)[36m::Bool[39m
[90m└──[39m       goto #4 if not %10
[90m2 ┄[39m %12 = @_4::Tup

Using abstract data types inside user defined structures introduces a type instability which significantly reduces performance.

#### Using Concrete Types in Structures

In [27]:
struct MyGoodInflexibleData <: MyDataTypes
    id:: Int
    value:: Float64
end

In [28]:
data_array_good1 = Array{MyGoodInflexibleData, 1}(undef, 1_000)
@btime fill_data!(data_array_good1)

  5.250 μs (0 allocations: 0 bytes)


In [29]:
@btime aggregate_data(data_array_good1)

  1.472 μs (1 allocation: 16 bytes)


250311.52680232056

In [30]:
@code_warntype aggregate_data(data_array_good1)

Variables
  #self#[36m::Core.Compiler.Const(aggregate_data, false)[39m
  data_array[36m::Array{MyGoodInflexibleData,1}[39m
  res[36m::Float64[39m
  @_4[33m[1m::Union{Nothing, Tuple{Int64,Int64}}[22m[39m
  i[36m::Int64[39m
  val[36m::MyGoodInflexibleData[39m
  row[36m::MyGoodInflexibleData[39m

Body[36m::Float64[39m
[90m1 ─[39m %1  = Base.getindex(data_array, 1)[36m::MyGoodInflexibleData[39m
[90m│  [39m %2  = Base.getproperty(%1, :id)[36m::Int64[39m
[90m│  [39m %3  = Base.getindex(data_array, 1)[36m::MyGoodInflexibleData[39m
[90m│  [39m %4  = Base.getproperty(%3, :value)[36m::Float64[39m
[90m│  [39m %5  = (%2 * %4)[36m::Float64[39m
[90m│  [39m       (res = Main.zero(%5))
[90m│  [39m %7  = Main.eachindex(data_array)[36m::Base.OneTo{Int64}[39m
[90m│  [39m       (@_4 = Base.iterate(%7))
[90m│  [39m %9  = (@_4 === nothing)[36m::Bool[39m
[90m│  [39m %10 = Base.not_int(%9)[36m::Bool[39m
[90m└──[39m       goto #4 if not %10
[90m2 ┄[39m 

Defining concrete data types inside a structure gives type-stability (and thus performace), but reduces flexibility - e.g. we cannot use Float32 as *value* anymore.

#### Parametric Types

In [31]:
struct MyGoodData{T <: Integer, U <: Number} <: MyDataTypes
    id:: T
    value:: U
end

In [32]:
data_array_good2 = Array{MyGoodData{Int, Float64}, 1}(undef, 1_000)
@btime fill_data!(data_array_good2)

  5.251 μs (0 allocations: 0 bytes)


In [33]:
@btime aggregate_data(data_array_good2)

  1.470 μs (1 allocation: 16 bytes)


254783.94285752467

In [34]:
@code_warntype aggregate_data(data_array_good2)

Variables
  #self#[36m::Core.Compiler.Const(aggregate_data, false)[39m
  data_array[36m::Array{MyGoodData{Int64,Float64},1}[39m
  res[36m::Float64[39m
  @_4[33m[1m::Union{Nothing, Tuple{Int64,Int64}}[22m[39m
  i[36m::Int64[39m
  val[36m::MyGoodData{Int64,Float64}[39m
  row[36m::MyGoodData{Int64,Float64}[39m

Body[36m::Float64[39m
[90m1 ─[39m %1  = Base.getindex(data_array, 1)[36m::MyGoodData{Int64,Float64}[39m
[90m│  [39m %2  = Base.getproperty(%1, :id)[36m::Int64[39m
[90m│  [39m %3  = Base.getindex(data_array, 1)[36m::MyGoodData{Int64,Float64}[39m
[90m│  [39m %4  = Base.getproperty(%3, :value)[36m::Float64[39m
[90m│  [39m %5  = (%2 * %4)[36m::Float64[39m
[90m│  [39m       (res = Main.zero(%5))
[90m│  [39m %7  = Main.eachindex(data_array)[36m::Base.OneTo{Int64}[39m
[90m│  [39m       (@_4 = Base.iterate(%7))
[90m│  [39m %9  = (@_4 === nothing)[36m::Bool[39m
[90m│  [39m %10 = Base.not_int(%9)[36m::Bool[39m
[90m└──[39m       goto #4 

Parametric data types give both type-stability (and thus performance) and flexibility and are therefore usually the best solution.

## Allocations

## Further Optimizations

The following macros could give significant speed-ups in certain situations.
However, there is a good reason why these optimizations are not enabled by default, therefore use with caution.

### Baseline

In [42]:
my_array = rand(1_000_000)

1000000-element Array{Float64,1}:
 0.48251039775571636 
 0.6060967900875225  
 0.6433419971075629  
 0.5672518352733051  
 0.6330449556462303  
 0.7847472161059443  
 0.6346399388443404  
 0.9106312549030293  
 0.7672216770021818  
 0.016045060370933006
 0.002823502264714506
 0.04043181535589446 
 0.8424429448402266  
 ⋮                   
 0.20209203603669912 
 0.7235009637225687  
 0.7721670611958109  
 0.5345527557796439  
 0.29539609857093296 
 0.35311189336195103 
 0.30548114650768987 
 0.841808211370416   
 0.6500169780553795  
 0.41714463376159006 
 0.9968664190952854  
 0.3217207185034947  

In [43]:
function test_agg(array)
    res = 0.
    for i = 1:length(array)
        res += array[i]
    end
    res
end

test_agg (generic function with 1 method)

In [44]:
@btime test_agg($my_array)

  1.826 ms (0 allocations: 0 bytes)


499669.7343965981

### Deactivation of Bounds Checks

In [45]:
function test_agg_inbounds(array)
    res = 0.
    for i = 1:length(array)
        @inbounds res += array[i]
    end
    res
end

test_agg_inbounds (generic function with 1 method)

In [46]:
@btime test_agg_inbounds($my_array)

  1.687 ms (0 allocations: 0 bytes)


499669.7343965981

In [52]:
@assert test_agg(my_array) == test_agg_inbounds(my_array)

The *@inbounds* macro disables array boundary checks and gives a speedup of ca. 10% here.

However, be careful:

In [63]:
function test_agg_bugged(array)
    res = 0.
    for i = 1:length(array)+1 # bug: loop should go to length, not length + 1!
        res += array[i]
    end
    res
end

test_agg_bugged (generic function with 1 method)

In [64]:
test_agg_bugged(my_array)

BoundsError: BoundsError: attempt to access 1000000-element Array{Float64,1} at index [1000001]

In [65]:
function test_agg_inbounds_bugged(array)
    res = 0.
    for i = 1:length(array)+1 # bug: loop should go to length, not length + 1!
        @inbounds res += array[i]
    end
    res
end

test_agg_inbounds_bugged (generic function with 1 method)

In [69]:
test_agg_inbounds_bugged(my_array)

499669.7343965981

The bug in the code is not detected because of the inbounds macro.
The result of accessing an array out of bounds is not predictable.

### SIMD

This macro makes use the the Single Instruction Multiple Data functionality of modern CPUs.

It should only be used if the loop iterations are independent and the order of iterations can be changed.

In [76]:
function test_agg_simd(array)
    res = 0.
    @simd for i = 1:length(array)
        @inbounds res += array[i]
    end
    res
end

test_agg_simd (generic function with 1 method)

In [77]:
@btime test_agg_simd($my_array)

  1.227 ms (0 allocations: 0 bytes)


499669.7343966109

In [78]:
test_agg(my_array) - test_agg_simd(my_array)

-1.2747477740049362e-8

In [86]:
test_agg(my_array) ≈ test_agg_simd(my_array) # type: \approx <tab>

true

The @simd macro gives a speedup of 20%, but changes (slightly) the calculation result, likely due to modified order of loop elements.

In [79]:
@code_llvm test_agg_simd(my_array)


;  @ In[76]:2 within `test_agg_simd'
define double @julia_test_agg_simd_17291(%jl_value_t addrspace(10)* nonnull align 16 dereferenceable(40)) {
top:
;  @ In[76]:3 within `test_agg_simd'
; ┌ @ simdloop.jl:69 within `macro expansion'
; │┌ @ array.jl:200 within `length'
    %1 = addrspacecast %jl_value_t addrspace(10)* %0 to %jl_value_t addrspace(11)*
    %2 = bitcast %jl_value_t addrspace(11)* %1 to %jl_array_t addrspace(11)*
    %3 = getelementptr inbounds %jl_array_t, %jl_array_t addrspace(11)* %2, i64 0, i32 1
    %4 = load i64, i64 addrspace(11)* %3, align 8
; │└
; │┌ @ range.jl:5 within `Colon'
; ││┌ @ range.jl:275 within `Type'
; │││┌ @ range.jl:280 within `unitrange_last'
; ││││┌ @ operators.jl:341 within `>='
; │││││┌ @ int.jl:424 within `<='
        %5 = icmp sgt i64 %4, 0
; ││││└└
      %6 = select i1 %5, i64 %4, i64 0
; │└└└
; │ @ simdloop.jl:71 within `macro expansion'
; │┌ @ simdloop.jl:51 within `simd_inner_length'
; ││┌ @ range.jl:541 within `length'
; │││┌ @ checked.jl:

Note the operations on data types like *<2x double>*.

In [83]:
function test_agg_simd_bad(array)
    res = 0.
    @simd for i = 1:length(array)
        res += array[i]
    end
    res
end

test_agg_simd_bad (generic function with 1 method)

In [84]:
@btime test_agg_simd_bad($my_array)

  1.837 ms (0 allocations: 0 bytes)


499669.7343965981

In [85]:
@code_llvm test_agg_simd_bad(my_array)


;  @ In[83]:2 within `test_agg_simd_bad'
define double @julia_test_agg_simd_bad_17346(%jl_value_t addrspace(10)* nonnull align 16 dereferenceable(40)) {
top:
;  @ In[83]:3 within `test_agg_simd_bad'
; ┌ @ simdloop.jl:69 within `macro expansion'
; │┌ @ array.jl:200 within `length'
    %1 = addrspacecast %jl_value_t addrspace(10)* %0 to %jl_value_t addrspace(11)*
    %2 = bitcast %jl_value_t addrspace(11)* %1 to %jl_array_t addrspace(11)*
    %3 = getelementptr inbounds %jl_array_t, %jl_array_t addrspace(11)* %2, i64 0, i32 1
    %4 = load i64, i64 addrspace(11)* %3, align 8
; │└
; │┌ @ range.jl:5 within `Colon'
; ││┌ @ range.jl:275 within `Type'
; │││┌ @ range.jl:280 within `unitrange_last'
; ││││┌ @ operators.jl:341 within `>='
; │││││┌ @ int.jl:424 within `<='
        %5 = icmp sgt i64 %4, 0
; ││││└└
      %6 = select i1 %5, i64 %4, i64 0
; │└└└
; │ @ simdloop.jl:71 within `macro expansion'
; │┌ @ simdloop.jl:51 within `simd_inner_length'
; ││┌ @ range.jl:541 within `length'
; │││┌ @

Without the @inbounds macro, the array boundary checks prevent the simd optimizations - the benchmark shows no improvement w.r.t. the baseline.

## Test Case: Matrix Multiplication

In [2]:
n = 500
N = rand(n, n)
M = rand(n, n)

500×500 Array{Float64,2}:
 0.156731  0.6796     0.853618   …  0.872105   0.849378    0.0680059
 0.358122  0.328798   0.829952      0.906086   0.438232    0.23115  
 0.993626  0.526487   0.272948      0.538566   0.987162    0.0199736
 0.550159  0.76025    0.807789      0.533458   0.669581    0.715304 
 0.852079  0.942162   0.683549      0.077582   0.591801    0.95098  
 0.196827  0.640675   0.425311   …  0.0424427  0.98901     0.126323 
 0.165545  0.554632   0.552857      0.565712   0.00495803  0.972921 
 0.349934  0.299328   0.42443       0.510276   0.221542    0.636376 
 0.730736  0.477905   0.800342      0.822939   0.824897    0.636097 
 0.55802   0.812609   0.795543      0.0972319  0.118334    0.126863 
 0.387274  0.694672   0.728967   …  0.708464   0.244191    0.354581 
 0.196807  0.717834   0.769518      0.564327   0.708227    0.675514 
 0.592995  0.450996   0.638093      0.0778751  0.828478    0.754288 
 ⋮                               ⋱                                  
 0.83226

### Reference: OpenBLAS

OpenBLAS is a highly optimized library for linear algebra. Julia matrix multiplications use OpenBLAS as default.

In [3]:
using LinearAlgebra
BLAS.openblas_get_config()

"OpenBLAS 0.3.5  USE64BITINT DYNAMIC_ARCH NO_AFFINITY Atom MAX_THREADS=16"

In [4]:
@which N*M

In [5]:
BLAS.set_num_threads(4)
@btime $N*$M

  13.154 ms (2 allocations: 1.91 MiB)


500×500 Array{Float64,2}:
 124.848  123.635  125.802  122.014  …  121.236  122.371  121.662  121.049
 122.146  123.631  122.611  124.47      120.638  121.279  118.603  119.706
 128.689  125.308  126.183  124.783     125.063  124.972  126.531  119.959
 127.006  128.714  127.763  130.21      127.647  126.742  126.981  126.277
 118.159  120.993  120.368  116.243     117.775  117.035  120.326  118.36 
 119.597  115.836  119.892  115.21   …  118.544  121.697  120.096  115.935
 123.629  120.191  124.221  115.748     118.078  120.124  118.834  118.577
 124.925  124.226  127.184  122.156     125.857  126.295  123.786  120.916
 128.974  128.868  129.097  126.122     127.89   128.999  124.156  122.598
 123.55   123.931  122.707  122.618     122.158  125.453  119.884  121.837
 121.068  121.048  122.394  119.202  …  117.581  118.312  120.44   119.909
 123.734  124.959  122.711  123.078     118.507  126.199  122.914  118.429
 123.854  122.375  125.882  122.627     120.587  122.503  122.633  124.585

In [6]:
BLAS.set_num_threads(1)
@btime $N*$M
BLAS.set_num_threads(4)

  49.661 ms (2 allocations: 1.91 MiB)


### Implement Own Matrix Multiplication

#### First Iteration

In [135]:
function my_mult1(A:: AbstractArray{T,2}, B:: AbstractArray{T,2}) where {T}
    @assert size(A, 2) == size(B, 1)
    C = zeros(T, size(A,1), size(B,2))
    for m=1:size(A, 1), n=1:size(B, 2), k=1:size(A,2)
        C[m,n] += A[m,k]*B[k,n]
    end
    C
end

my_mult1 (generic function with 1 method)

In [136]:
@assert N*M ≈ my_mult1(N,M)
@btime my_mult1($N, $M)

  684.565 ms (2 allocations: 1.91 MiB)


500×500 Array{Float64,2}:
 127.1    127.448  125.48   123.864  …  118.352  131.542  122.332  129.782
 132.128  130.201  129.413  128.8       125.974  135.836  125.388  132.025
 130.231  123.881  125.301  123.95      121.464  131.338  120.117  126.382
 124.807  121.853  125.99   120.541     119.367  129.709  126.227  124.737
 116.927  117.9    119.265  117.267     115.623  121.237  115.533  118.444
 126.915  126.852  132.014  126.511  …  123.713  130.01   125.324  132.893
 128.23   129.15   125.654  123.774     119.361  129.986  124.139  129.694
 125.601  128.121  127.009  125.056     119.375  129.708  121.754  132.227
 123.635  123.911  123.799  122.375     118.105  127.535  118.843  124.107
 128.2    124.269  125.739  124.287     120.421  132.816  123.183  133.193
 121.276  126.084  122.185  121.759  …  117.487  127.389  120.534  125.527
 117.566  121.278  119.401  115.98      115.143  126.34   115.648  120.811
 129.288  127.363  128.213  125.14      123.053  130.595  124.913  130.13 

This is 50 times slower than OpenBlas.

#### Disable Bounds Checks

In [142]:
function my_mult2(A:: AbstractArray{T,2}, B:: AbstractArray{T,2}) where {T}
    @assert size(A, 2) == size(B, 1)
    C = zeros(T, size(A,1), size(B,2))
    for m=1:size(A, 1), n=1:size(B, 2), k=1:size(A,2)
        @inbounds C[m,n] += A[m,k]*B[k,n]
    end
    C
end

my_mult2 (generic function with 1 method)

In [143]:
@assert N*M ≈ my_mult2(N,M)
@btime my_mult2($N, $M)

  488.234 ms (2 allocations: 1.91 MiB)


500×500 Array{Float64,2}:
 127.1    127.448  125.48   123.864  …  118.352  131.542  122.332  129.782
 132.128  130.201  129.413  128.8       125.974  135.836  125.388  132.025
 130.231  123.881  125.301  123.95      121.464  131.338  120.117  126.382
 124.807  121.853  125.99   120.541     119.367  129.709  126.227  124.737
 116.927  117.9    119.265  117.267     115.623  121.237  115.533  118.444
 126.915  126.852  132.014  126.511  …  123.713  130.01   125.324  132.893
 128.23   129.15   125.654  123.774     119.361  129.986  124.139  129.694
 125.601  128.121  127.009  125.056     119.375  129.708  121.754  132.227
 123.635  123.911  123.799  122.375     118.105  127.535  118.843  124.107
 128.2    124.269  125.739  124.287     120.421  132.816  123.183  133.193
 121.276  126.084  122.185  121.759  …  117.487  127.389  120.534  125.527
 117.566  121.278  119.401  115.98      115.143  126.34   115.648  120.811
 129.288  127.363  128.213  125.14      123.053  130.595  124.913  130.13 

A bit better, "only" 45 times slower than OpenBLAS.

#### Re-Order For Loops to Match Memory Layout

In [7]:
function my_mult3(A:: AbstractArray{T,2}, B:: AbstractArray{T,2}) where {T}
    @assert size(A, 2) == size(B, 1)
    C = zeros(T, size(A,1), size(B,2))
    for n=1:size(B, 2), k=1:size(A,2), m=1:size(A, 1)
        @inbounds C[m,n] += A[m,k]*B[k,n]
    end
    C
end

my_mult3 (generic function with 1 method)

Arrays in Julia are column-major, i.e. the leftmost index should be in the innermost loop.

For this, the order of the for loops must be reverse to the order of indices in the matrix operations.

In [8]:
@assert N*M ≈ my_mult3(N,M)
@btime my_mult3($N, $M)

  180.692 ms (2 allocations: 1.91 MiB)


500×500 Array{Float64,2}:
 124.848  123.635  125.802  122.014  …  121.236  122.371  121.662  121.049
 122.146  123.631  122.611  124.47      120.638  121.279  118.603  119.706
 128.689  125.308  126.183  124.783     125.063  124.972  126.531  119.959
 127.006  128.714  127.763  130.21      127.647  126.742  126.981  126.277
 118.159  120.993  120.368  116.243     117.775  117.035  120.326  118.36 
 119.597  115.836  119.892  115.21   …  118.544  121.697  120.096  115.935
 123.629  120.191  124.221  115.748     118.078  120.124  118.834  118.577
 124.925  124.226  127.184  122.156     125.857  126.295  123.786  120.916
 128.974  128.868  129.097  126.122     127.89   128.999  124.156  122.598
 123.55   123.931  122.707  122.618     122.158  125.453  119.884  121.837
 121.068  121.048  122.394  119.202  …  117.581  118.312  120.44   119.909
 123.734  124.959  122.711  123.078     118.507  126.199  122.914  118.429
 123.854  122.375  125.882  122.627     120.587  122.503  122.633  124.585

Much better, only 14 times slower than OpenBlas (multi-threaded) or 4 times slower than single threaded OpenBlas.

#### SIMD

In [150]:
function my_mult4(A:: AbstractArray{T,2}, B:: AbstractArray{T,2}) where {T}
    @assert size(A, 2) == size(B, 1)
    C = zeros(T, size(A,1), size(B,2))
    for n=1:size(B, 2), k=1:size(A,2)
        @simd for m=1:size(A, 1)
            @inbounds C[m,n] += A[m,k]*B[k,n]
        end
    end
    C
end

my_mult4 (generic function with 1 method)

In [151]:
@assert N*M ≈ my_mult4(N,M)
@btime my_mult4($N, $M)

  180.917 ms (2 allocations: 1.91 MiB)


500×500 Array{Float64,2}:
 127.1    127.448  125.48   123.864  …  118.352  131.542  122.332  129.782
 132.128  130.201  129.413  128.8       125.974  135.836  125.388  132.025
 130.231  123.881  125.301  123.95      121.464  131.338  120.117  126.382
 124.807  121.853  125.99   120.541     119.367  129.709  126.227  124.737
 116.927  117.9    119.265  117.267     115.623  121.237  115.533  118.444
 126.915  126.852  132.014  126.511  …  123.713  130.01   125.324  132.893
 128.23   129.15   125.654  123.774     119.361  129.986  124.139  129.694
 125.601  128.121  127.009  125.056     119.375  129.708  121.754  132.227
 123.635  123.911  123.799  122.375     118.105  127.535  118.843  124.107
 128.2    124.269  125.739  124.287     120.421  132.816  123.183  133.193
 121.276  126.084  122.185  121.759  …  117.487  127.389  120.534  125.527
 117.566  121.278  119.401  115.98      115.143  126.34   115.648  120.811
 129.288  127.363  128.213  125.14      123.053  130.595  124.913  130.13 

No performance gain from adding @simd.
Why?

In [153]:
@code_llvm my_mult3(N,M)


;  @ In[144]:2 within `my_mult3'
define nonnull %jl_value_t addrspace(10)* @japi1_my_mult3_17903(%jl_value_t addrspace(10)*, %jl_value_t addrspace(10)**, i32) #0 {
top:
  %gcframe = alloca %jl_value_t addrspace(10)*, i32 3
  %3 = bitcast %jl_value_t addrspace(10)** %gcframe to i8*
  call void @llvm.memset.p0i8.i32(i8* %3, i8 0, i32 24, i32 0, i1 false)
  %4 = alloca %jl_value_t addrspace(10)**, align 8
  store volatile %jl_value_t addrspace(10)** %1, %jl_value_t addrspace(10)*** %4, align 8
  %thread_ptr = call i8* asm "movq %fs:0, $0", "=r"()
  %ptls_i8 = getelementptr i8, i8* %thread_ptr, i64 -15560
  %ptls = bitcast i8* %ptls_i8 to %jl_value_t***
  %5 = getelementptr %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)** %gcframe, i32 0
  %6 = bitcast %jl_value_t addrspace(10)** %5 to i64*
  store i64 2, i64* %6
  %7 = getelementptr %jl_value_t**, %jl_value_t*** %ptls, i32 0
  %8 = getelementptr %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)** %gcframe, i32 1
  %9 = bitcast

In [152]:
@code_llvm my_mult4(N,M)


;  @ In[150]:2 within `my_mult4'
define nonnull %jl_value_t addrspace(10)* @japi1_my_mult4_17945(%jl_value_t addrspace(10)*, %jl_value_t addrspace(10)**, i32) #0 {
top:
  %gcframe = alloca %jl_value_t addrspace(10)*, i32 3
  %3 = bitcast %jl_value_t addrspace(10)** %gcframe to i8*
  call void @llvm.memset.p0i8.i32(i8* %3, i8 0, i32 24, i32 0, i1 false)
  %4 = alloca %jl_value_t addrspace(10)**, align 8
  store volatile %jl_value_t addrspace(10)** %1, %jl_value_t addrspace(10)*** %4, align 8
  %thread_ptr = call i8* asm "movq %fs:0, $0", "=r"()
  %ptls_i8 = getelementptr i8, i8* %thread_ptr, i64 -15560
  %ptls = bitcast i8* %ptls_i8 to %jl_value_t***
  %5 = getelementptr %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)** %gcframe, i32 0
  %6 = bitcast %jl_value_t addrspace(10)** %5 to i64*
  store i64 2, i64* %6
  %7 = getelementptr %jl_value_t**, %jl_value_t*** %ptls, i32 0
  %8 = getelementptr %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)** %gcframe, i32 1
  %9 = bitcast

It looks like simd instructions are already used in the version without the @simd macro.

#### Multithreading

OpenBlas uses multiple threads for parallelization of matrix multiplications.
We should do the same for our implementation.

In [159]:
Threads.nthreads()

4

In [161]:
function my_mult5(A:: AbstractArray{T,2}, B:: AbstractArray{T,2}) where {T}
    @assert size(A, 2) == size(B, 1)
    C = zeros(T, size(A,1), size(B,2))
    Threads.@threads for n=1:size(B, 2)
        for k=1:size(A,2), m=1:size(A, 1)
            @inbounds C[m,n] += A[m,k]*B[k,n]
        end
    end
    C
end

my_mult5 (generic function with 1 method)

In [162]:
@assert N*M ≈ my_mult5(N,M)
@btime my_mult5($N, $M)

  53.037 ms (32 allocations: 1.91 MiB)


500×500 Array{Float64,2}:
 127.1    127.448  125.48   123.864  …  118.352  131.542  122.332  129.782
 132.128  130.201  129.413  128.8       125.974  135.836  125.388  132.025
 130.231  123.881  125.301  123.95      121.464  131.338  120.117  126.382
 124.807  121.853  125.99   120.541     119.367  129.709  126.227  124.737
 116.927  117.9    119.265  117.267     115.623  121.237  115.533  118.444
 126.915  126.852  132.014  126.511  …  123.713  130.01   125.324  132.893
 128.23   129.15   125.654  123.774     119.361  129.986  124.139  129.694
 125.601  128.121  127.009  125.056     119.375  129.708  121.754  132.227
 123.635  123.911  123.799  122.375     118.105  127.535  118.843  124.107
 128.2    124.269  125.739  124.287     120.421  132.816  123.183  133.193
 121.276  126.084  122.185  121.759  …  117.487  127.389  120.534  125.527
 117.566  121.278  119.401  115.98      115.143  126.34   115.648  120.811
 129.288  127.363  128.213  125.14      123.053  130.595  124.913  130.13 

After these optimizations, our own, very short, implementation is "only" 4 times slower than the highly specialized and optimized OpenBLAS.

#### LoopVectorization

In [178]:
using Pkg
Pkg.add("LoopVectorization")

[32m[1m Resolving[22m[39m package versions...
[32m[1m Installed[22m[39m SIMDPirates ─────── v0.1.1
[32m[1m Installed[22m[39m CpuId ───────────── v0.2.2
[32m[1m Installed[22m[39m SLEEFPirates ────── v0.1.1
[32m[1m Installed[22m[39m VectorizationBase ─ v0.1.4
[32m[1m Installed[22m[39m LoopVectorization ─ v0.1.3
[32m[1m Installed[22m[39m DataStructures ──── v0.17.7
[32m[1m Installed[22m[39m Colors ──────────── v0.11.2
[32m[1m Installed[22m[39m Distributions ───── v0.21.12
[32m[1m  Updating[22m[39m `/opt/julia/environments/v1.2/Project.toml`
 [90m [bdcacae8][39m[92m + LoopVectorization v0.1.3[39m
[32m[1m  Updating[22m[39m `/opt/julia/environments/v1.2/Manifest.toml`
 [90m [5ae59095][39m[93m ↑ Colors v0.11.1 ⇒ v0.11.2[39m
 [90m [adafc99b][39m[92m + CpuId v0.2.2[39m
 [90m [864edb3b][39m[93m ↑ DataStructures v0.17.6 ⇒ v0.17.7[39m
 [90m [31c24e10][39m[93m ↑ Distributions v0.21.11 ⇒ v0.21.12[39m
 [90m [bdcacae8][39m[92m + LoopVe

In [179]:
using LoopVectorization

┌ Info: Precompiling LoopVectorization [bdcacae8-1622-11e9-2a5c-532679323890]
└ @ Base loading.jl:1242


First try it single-threaded.

In [181]:
function my_mult6(A:: AbstractArray{T,2}, B:: AbstractArray{T,2}) where {T}
    @assert size(A, 2) == size(B, 1)
    C = zeros(T, size(A,1), size(B,2))
    @avx for n=1:size(B, 2), k=1:size(A,2), m=1:size(A, 1)
        C[m,n] += A[m,k]*B[k,n]
    end
    C
end

my_mult6 (generic function with 1 method)

In [182]:
@assert N*M ≈ my_mult6(N,M)
@btime my_mult6($N, $M)

  118.166 ms (2 allocations: 1.91 MiB)


500×500 Array{Float64,2}:
 127.1    127.448  125.48   123.864  …  118.352  131.542  122.332  129.782
 132.128  130.201  129.413  128.8       125.974  135.836  125.388  132.025
 130.231  123.881  125.301  123.95      121.464  131.338  120.117  126.382
 124.807  121.853  125.99   120.541     119.367  129.709  126.227  124.737
 116.927  117.9    119.265  117.267     115.623  121.237  115.533  118.444
 126.915  126.852  132.014  126.511  …  123.713  130.01   125.324  132.893
 128.23   129.15   125.654  123.774     119.361  129.986  124.139  129.694
 125.601  128.121  127.009  125.056     119.375  129.708  121.754  132.227
 123.635  123.911  123.799  122.375     118.105  127.535  118.843  124.107
 128.2    124.269  125.739  124.287     120.421  132.816  123.183  133.193
 121.276  126.084  122.185  121.759  …  117.487  127.389  120.534  125.527
 117.566  121.278  119.401  115.98      115.143  126.34   115.648  120.811
 129.288  127.363  128.213  125.14      123.053  130.595  124.913  130.13 

Performance is significantly better than for the previous single-threaded version (120 ms vs. 180 ms) and olny a factor of 2.4 slower than single-threaded OpenBlas.

However, multithreading seems not to work here.

## Profiling

### Built-in Profiler

In [10]:
using Profile

In [15]:
@profile my_mult3(N,M)
Profile.print()

471  ./task.jl:268; (::getfield(IJulia, Symbol("##15#1...
 471 ...F1GUo/src/eventloop.jl:8; eventloop(::ZMQ.Socket)
  471 ./essentials.jl:789; invokelatest
   471 ./essentials.jl:790; #invokelatest#1
    471 ...rc/execute_request.jl:67; execute_request(::ZMQ.Socket, ::I...
     471 ...c/SoftGlobalScope.jl:218; softscope_include_string(::Modu...
      471 ./boot.jl:330; eval
       3   ./In[12]:0; my_mult3(::Array{Float64,2}, ::...
       16  ./In[12]:4; my_mult3(::Array{Float64,2}, ::...
       431 ./In[12]:5; my_mult3(::Array{Float64,2}, ::...
        290 ./array.jl:729; getindex
        57  ./array.jl:768; setindex!
        84  ./float.jl:395; +
       5   ./array.jl:0; my_mult3(::Array{Float64,2}, ::...
1422 ./task.jl:327; task_done_hook(::Task)
 1422 ./task.jl:591; wait()
  1422 ./task.jl:564; poptaskref(::Base.InvasiveLinkedL...


### Traceur

In [19]:
using Pkg
Pkg.add("Traceur")

[32m[1m Resolving[22m[39m package versions...
[32m[1m  Updating[22m[39m `/opt/julia/environments/v1.2/Project.toml`
[90m [no changes][39m
[32m[1m  Updating[22m[39m `/opt/julia/environments/v1.2/Manifest.toml`
[90m [no changes][39m


In [9]:
using Traceur

In [10]:
?@trace

```
@trace(functioncall(args...), maxdepth=2, modules=[])
```

Analyse `functioncall(args...)` for common performance problems and print them to the terminal.

Optional arguments:

  * `maxdepth` constrols how far Traceur recurses through the call stack.
  * If `modules` is nonempty, only warnings for methods defined in one of the modules specified will be printed.


In [12]:
@trace my_mult3(N[1:10,1:10],M[1:10,1:10])

└ @ multidimensional.jl:-1
└ @ multidimensional.jl:-1
└ @ multidimensional.jl:-1
└ @ multidimensional.jl:-1
└ @ multidimensional.jl:-1
└ @ multidimensional.jl:-1
└ @ array.jl:-1
└ @ array.jl:-1
└ @ In[7]:-1
└ @ In[7]:-1
└ @ In[7]:-1
└ @ In[7]:-1
└ @ In[7]:-1
└ @ In[7]:-1


10×10 Array{Float64,2}:
 2.57642  3.34419  3.52932  2.73674  …  1.58622   1.81356  4.20893  2.57049
 2.6925   2.92252  3.24841  2.77388     1.3328    1.86014  3.4557   3.11845
 1.9042   2.25962  2.36886  2.11691     0.971     1.36171  2.73181  2.6674 
 2.15174  2.98565  3.0704   2.29895     1.23425   1.45139  3.07028  1.88571
 1.95877  2.49404  2.72578  2.31636     1.15275   1.35725  3.07827  2.52607
 1.66516  1.91409  2.17439  1.72722  …  0.866641  1.0535   2.25487  1.9839 
 2.6037   2.85407  3.11888  2.71459     1.3147    2.04739  2.97024  2.48853
 2.52278  3.18912  3.2691   2.46799     1.50454   1.84537  3.82622  2.42145
 2.36727  3.11999  3.6394   2.71466     1.32336   1.36429  3.58355  3.00874
 2.0721   2.74922  2.84992  1.9043      1.29324   1.65696  3.23002  1.81846

## More testing on bound checks

In [2]:
using BenchmarkTools

In [3]:
function safe_loop(x)
    q = 0
    for i in eachindex(x)
        q += x[i]
    end
    q
end

safe_loop (generic function with 1 method)

In [4]:
x = [1, 2, 3, 4]
safe_loop(x)

10

In [5]:
@btime safe_loop($x)

  9.142 ns (0 allocations: 0 bytes)


10

In [6]:
@code_llvm safe_loop(x)


;  @ In[3]:2 within `safe_loop'
define i64 @julia_safe_loop_17789(%jl_value_t addrspace(10)* nonnull align 16 dereferenceable(40)) {
top:
;  @ In[3]:3 within `safe_loop'
; ┌ @ abstractarray.jl:212 within `eachindex'
; │┌ @ abstractarray.jl:95 within `axes1'
; ││┌ @ abstractarray.jl:75 within `axes'
; │││┌ @ array.jl:155 within `size'
      %1 = addrspacecast %jl_value_t addrspace(10)* %0 to %jl_value_t addrspace(11)*
      %2 = bitcast %jl_value_t addrspace(11)* %1 to %jl_value_t addrspace(10)* addrspace(11)*
      %3 = getelementptr inbounds %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)* addrspace(11)* %2, i64 3
      %4 = bitcast %jl_value_t addrspace(10)* addrspace(11)* %3 to i64 addrspace(11)*
      %5 = load i64, i64 addrspace(11)* %4, align 8
; │││└
; │││┌ @ tuple.jl:139 within `map'
; ││││┌ @ range.jl:320 within `OneTo' @ range.jl:311
; │││││┌ @ promotion.jl:412 within `max'
        %6 = icmp sgt i64 %5, 0
; └└└└└└
  br i1 %6, label %L8.L13_crit_edge, label %L29

L8.L13

In [7]:
t = (1,2,3,4)

(1, 2, 3, 4)

In [8]:
@btime safe_loop($t)

  2.064 ns (0 allocations: 0 bytes)


10

In [9]:
@code_llvm safe_loop(t)


;  @ In[3]:2 within `safe_loop'
define i64 @julia_safe_loop_17950([4 x i64] addrspace(11)* nocapture nonnull readonly dereferenceable(32)) {
L18:
;  @ In[3]:4 within `safe_loop'
; ┌ @ int.jl:53 within `+'
   %1 = bitcast [4 x i64] addrspace(11)* %0 to <4 x i64> addrspace(11)*
   %2 = load <4 x i64>, <4 x i64> addrspace(11)* %1, align 8
   %rdx.shuf = shufflevector <4 x i64> %2, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
   %bin.rdx = add <4 x i64> %2, %rdx.shuf
   %rdx.shuf12 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
   %bin.rdx13 = add <4 x i64> %bin.rdx, %rdx.shuf12
   %3 = extractelement <4 x i64> %bin.rdx13, i32 0
; └
;  @ In[3]:6 within `safe_loop'
  ret i64 %3
}


In [10]:
function safe_loop_inbound(x)
    q = 0
    for i in eachindex(x)
        @inbounds q += x[i]
    end
    q
end

safe_loop_inbound (generic function with 1 method)

In [11]:
@btime safe_loop_inbound($x)

  7.767 ns (0 allocations: 0 bytes)


10

In [12]:
@code_llvm safe_loop_inbound(x)


;  @ In[10]:2 within `safe_loop_inbound'
define i64 @julia_safe_loop_inbound_17977(%jl_value_t addrspace(10)* nonnull align 16 dereferenceable(40)) {
top:
;  @ In[10]:3 within `safe_loop_inbound'
; ┌ @ abstractarray.jl:212 within `eachindex'
; │┌ @ abstractarray.jl:95 within `axes1'
; ││┌ @ abstractarray.jl:75 within `axes'
; │││┌ @ array.jl:155 within `size'
      %1 = addrspacecast %jl_value_t addrspace(10)* %0 to %jl_value_t addrspace(11)*
      %2 = bitcast %jl_value_t addrspace(11)* %1 to %jl_value_t addrspace(10)* addrspace(11)*
      %3 = getelementptr inbounds %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)* addrspace(11)* %2, i64 3
      %4 = bitcast %jl_value_t addrspace(10)* addrspace(11)* %3 to i64 addrspace(11)*
      %5 = load i64, i64 addrspace(11)* %4, align 8
; │││└
; │││┌ @ tuple.jl:139 within `map'
; ││││┌ @ range.jl:320 within `OneTo' @ range.jl:311
; │││││┌ @ promotion.jl:412 within `max'
        %6 = icmp sgt i64 %5, 0
; └└└└└└
  br i1 %6, label %L8.L13_cri

In [13]:
function unsafe_loop(x)
    q = 0
    for i in 1:4
        q += x[i]
    end
    q
end

unsafe_loop (generic function with 1 method)

In [14]:
unsafe_loop(x)

10

In [15]:
@code_llvm unsafe_loop(x)


;  @ In[13]:2 within `unsafe_loop'
define i64 @julia_unsafe_loop_17990(%jl_value_t addrspace(10)* nonnull align 16 dereferenceable(40)) {
top:
;  @ In[13]:4 within `unsafe_loop'
; ┌ @ array.jl:744 within `getindex'
   %1 = addrspacecast %jl_value_t addrspace(10)* %0 to %jl_value_t addrspace(11)*
   %2 = bitcast %jl_value_t addrspace(11)* %1 to %jl_array_t addrspace(11)*
   %3 = getelementptr inbounds %jl_array_t, %jl_array_t addrspace(11)* %2, i64 0, i32 1
   %4 = load i64, i64 addrspace(11)* %3, align 8
   %5 = icmp eq i64 %4, 0
   br i1 %5, label %oob, label %idxend.lr.ph

idxend.lr.ph:                                     ; preds = %top
   %6 = bitcast %jl_value_t addrspace(11)* %1 to i64 addrspace(13)* addrspace(11)*
   %7 = load i64 addrspace(13)*, i64 addrspace(13)* addrspace(11)* %6, align 8
   br label %idxend

L17:                                              ; preds = %idxend
; └
; ┌ @ range.jl:598 within `iterate'
; │┌ @ int.jl:53 within `+'
    %8 = add nuw nsw i64 %value_p