In [2]:
using BenchmarkTools

In [19]:
a = zeros(100000)
b = rand(Int8, 100000)
c = rand(Int8, 100000)
d = rand(Int8, 100000)

100000-element Array{Int8,1}:
   90
    8
   27
  123
 -103
   98
  -51
  109
  -96
  100
 -101
  -25
   37
    ⋮
   -4
   64
  101
  -10
  -12
  -80
   32
  -43
 -116
   35
 -114
   25

In [20]:
function sum_vectors(a, b, c, d)
    n = length(a)
    for i in 1:n
        a[i] = b[i] + c[i] + d[i]
    end
end

sum_vectors (generic function with 1 method)

In [10]:
mutable struct particle
    x::Int8
end
@benchmark particles = Array{particle}(undef, 10000)

BenchmarkTools.Trial: 
  memory estimate:  78.20 KiB
  allocs estimate:  2
  --------------
  minimum time:     3.276 μs (0.00% GC)
  median time:      4.784 μs (0.00% GC)
  mean time:        7.513 μs (33.78% GC)
  maximum time:     5.444 ms (99.85% GC)
  --------------
  samples:          10000
  evals/sample:     8

In [13]:
particles = Array{particle}(undef, 10000)

10000-element Array{particle,1}:
 #undef
 #undef
 #undef
 #undef
 #undef
 #undef
 #undef
 #undef
 #undef
 #undef
 #undef
 #undef
 #undef
   ⋮   
 #undef
 #undef
 #undef
 #undef
 #undef
 #undef
 #undef
 #undef
 #undef
 #undef
 #undef
 #undef

In [11]:
@benchmark particles = [particle(0) for i in 1:10000]

BenchmarkTools.Trial: 
  memory estimate:  234.45 KiB
  allocs estimate:  10002
  --------------
  minimum time:     44.139 μs (0.00% GC)
  median time:      52.413 μs (0.00% GC)
  mean time:        69.416 μs (19.93% GC)
  maximum time:     44.029 ms (99.74% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [14]:
function generate_particle(particles)
    n = length(particles)
    for i in 1:n
        particles[i] = particle(0)
    end
    return particles
end
@benchmark generate_particle(particles)

BenchmarkTools.Trial: 
  memory estimate:  156.25 KiB
  allocs estimate:  10000
  --------------
  minimum time:     38.285 μs (0.00% GC)
  median time:      39.740 μs (0.00% GC)
  mean time:        53.794 μs (18.21% GC)
  maximum time:     39.398 ms (99.73% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [21]:
@benchmark sum_vectors(a, b, c, d)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     125.474 μs (0.00% GC)
  median time:      125.556 μs (0.00% GC)
  mean time:        136.913 μs (0.00% GC)
  maximum time:     987.026 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [22]:
@benchmark sum_vectors_simd(a, b, c, d)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     37.890 μs (0.00% GC)
  median time:      37.996 μs (0.00% GC)
  mean time:        40.743 μs (0.00% GC)
  maximum time:     534.254 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [12]:
function sum_vectors_simd(a, b, c, d)
    n = length(a)
    @inbounds @simd for i in 1:n
        a[i] = b[i] + c[i] + d[i]
    end
end

sum_vectors_simd (generic function with 1 method)

In [5]:
A = rand(100_000)
function simplesum(A)
    result = zero(eltype(A))
    for i in eachindex(A)
        result += A[i]
    end
    return result
end


function simdsum(A)
    result = zero(eltype(A))
    @inbounds @simd for i in eachindex(A)
        result += A[i]
    end
    return result
end




simdsum (generic function with 1 method)

In [6]:
@benchmark simplesum(A)

BenchmarkTools.Trial: 
  memory estimate:  16 bytes
  allocs estimate:  1
  --------------
  minimum time:     100.302 μs (0.00% GC)
  median time:      100.337 μs (0.00% GC)
  mean time:        103.201 μs (0.00% GC)
  maximum time:     193.263 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [7]:
@benchmark simdsum(A)

BenchmarkTools.Trial: 
  memory estimate:  16 bytes
  allocs estimate:  1
  --------------
  minimum time:     12.430 μs (0.00% GC)
  median time:      13.261 μs (0.00% GC)
  mean time:        14.238 μs (0.00% GC)
  maximum time:     66.388 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [8]:
simdsum(A)

49819.03557290002

#### @SIMD使ってみる

#### @SIMD使ってみる

\begin{align}
x_1 + y_1 &\rightarrow z_1 \\
x_2 + y_2 &\rightarrow z_2 \\
... \\
x_n + y_n &\rightarrow z_n
\end{align}

一度の命令にする

$$
\left(\begin{array}{cc}
x_1 \\
x_2 \\
... \\
x_n
\end{array}\right)
+
\left(\begin{array}{cc}
y_1 \\
y_2 \\
... \\
y_n
\end{array}\right)
\rightarrow
\left(\begin{array}{cc}
z_1 \\
z_2 \\
... \\
z_n
\end{array}\right)
$$

In [18]:
x = 0.0;
@simd for n = 1:10
  global x
  x += 1;
  x *= 2;
end
x

2046.0

In [154]:
A = rand(10^6)

function mysum(v::Vector{Float64})
    t = 0.0
    for x in v
        t += x
    end
    return t
end

function danger_zone(v::Vector{Float64})
    t = 0.0
    @inbounds @simd for x in v
        t += x
    end
    return t
end

danger_zone (generic function with 1 method)

In [155]:
sum(A)

500130.73179585056

In [156]:
mysum(A)

500130.7317958471

In [178]:
danger_zone(A)

500130.73179585143

In [53]:
one_to_n(1000000)==collect(1:1000000)

true

In [30]:
show_simd_sum(a)

i = 1
i = 2
i = 3
i = 4
i = 5
i = 6
i = 7
i = 8
i = 9
i = 10
i = 11
i = 12
i = 13
i = 14
i = 15
i = 16
i = 17
i = 18
i = 19
i = 20
i = 21
i = 22
i = 23
i = 24
i = 25
i = 26
i = 27
i = 28
i = 29
i = 30
i = 31
i = 32
i = 33
i = 34
i = 35
i = 36
i = 37
i = 38
i = 39
i = 40
i = 41
i = 42
i = 43
i = 44
i = 45
i = 46
i = 47
i = 48
i = 49
i = 50
i = 51
i = 52
i = 53
i = 54
i = 55
i = 56
i = 57
i = 58
i = 59
i = 60
i = 61
i = 62
i = 63
i = 64
i = 65
i = 66
i = 67
i = 68
i = 69
i = 70
i = 71
i = 72
i = 73
i = 74
i = 75
i = 76
i = 77
i = 78
i = 79
i = 80
i = 81
i = 82
i = 83
i = 84
i = 85
i = 86
i = 87
i = 88
i = 89
i = 90
i = 91
i = 92
i = 93
i = 94
i = 95
i = 96
i = 97
i = 98
i = 99
i = 100


5050