In [2]:
using BenchmarkTools

In [19]:
a = zeros(100000)
b = rand(Int8, 100000)
c = rand(Int8, 100000)
d = rand(Int8, 100000)

100000-element Array{Int8,1}:
   90
    8
   27
  123
 -103
   98
  -51
  109
  -96
  100
 -101
  -25
   37
    ⋮
   -4
   64
  101
  -10
  -12
  -80
   32
  -43
 -116
   35
 -114
   25

In [20]:
function sum_vectors(a, b, c, d)
    n = length(a)
    for i in 1:n
        a[i] = b[i] + c[i] + d[i]
    end
end

sum_vectors (generic function with 1 method)

In [10]:
mutable struct particle
    x::Int8
end
@benchmark particles = Array{particle}(undef, 10000)

BenchmarkTools.Trial: 
  memory estimate:  78.20 KiB
  allocs estimate:  2
  --------------
  minimum time:     3.276 μs (0.00% GC)
  median time:      4.784 μs (0.00% GC)
  mean time:        7.513 μs (33.78% GC)
  maximum time:     5.444 ms (99.85% GC)
  --------------
  samples:          10000
  evals/sample:     8

In [13]:
particles = Array{particle}(undef, 10000)

10000-element Array{particle,1}:
 #undef
 #undef
 #undef
 #undef
 #undef
 #undef
 #undef
 #undef
 #undef
 #undef
 #undef
 #undef
 #undef
   ⋮   
 #undef
 #undef
 #undef
 #undef
 #undef
 #undef
 #undef
 #undef
 #undef
 #undef
 #undef
 #undef

In [11]:
@benchmark particles = [particle(0) for i in 1:10000]

BenchmarkTools.Trial: 
  memory estimate:  234.45 KiB
  allocs estimate:  10002
  --------------
  minimum time:     44.139 μs (0.00% GC)
  median time:      52.413 μs (0.00% GC)
  mean time:        69.416 μs (19.93% GC)
  maximum time:     44.029 ms (99.74% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [14]:
function generate_particle(particles)
    n = length(particles)
    for i in 1:n
        particles[i] = particle(0)
    end
    return particles
end
@benchmark generate_particle(particles)

BenchmarkTools.Trial: 
  memory estimate:  156.25 KiB
  allocs estimate:  10000
  --------------
  minimum time:     38.285 μs (0.00% GC)
  median time:      39.740 μs (0.00% GC)
  mean time:        53.794 μs (18.21% GC)
  maximum time:     39.398 ms (99.73% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [21]:
@benchmark sum_vectors(a, b, c, d)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     125.474 μs (0.00% GC)
  median time:      125.556 μs (0.00% GC)
  mean time:        136.913 μs (0.00% GC)
  maximum time:     987.026 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [22]:
@benchmark sum_vectors_simd(a, b, c, d)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     37.890 μs (0.00% GC)
  median time:      37.996 μs (0.00% GC)
  mean time:        40.743 μs (0.00% GC)
  maximum time:     534.254 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [12]:
function sum_vectors_simd(a, b, c, d)
    n = length(a)
    @inbounds @simd for i in 1:n
        a[i] = b[i] + c[i] + d[i]
    end
end

sum_vectors_simd (generic function with 1 method)

In [5]:
A = rand(100_000)
function simplesum(A)
    result = zero(eltype(A))
    for i in eachindex(A)
        result += A[i]
    end
    return result
end


function simdsum(A)
    result = zero(eltype(A))
    @inbounds @simd for i in eachindex(A)
        result += A[i]
    end
    return result
end




simdsum (generic function with 1 method)

In [6]:
@benchmark simplesum(A)

BenchmarkTools.Trial: 
  memory estimate:  16 bytes
  allocs estimate:  1
  --------------
  minimum time:     100.302 μs (0.00% GC)
  median time:      100.337 μs (0.00% GC)
  mean time:        103.201 μs (0.00% GC)
  maximum time:     193.263 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [7]:
@benchmark simdsum(A)

BenchmarkTools.Trial: 
  memory estimate:  16 bytes
  allocs estimate:  1
  --------------
  minimum time:     12.430 μs (0.00% GC)
  median time:      13.261 μs (0.00% GC)
  mean time:        14.238 μs (0.00% GC)
  maximum time:     66.388 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [8]:
simdsum(A)

49819.03557290002

#### @SIMD使ってみる

#### @SIMD使ってみる

\begin{align}
x_1 + y_1 &\rightarrow z_1 \\
x_2 + y_2 &\rightarrow z_2 \\
... \\
x_n + y_n &\rightarrow z_n
\end{align}

一度の命令にする

$$
\left(\begin{array}{cc}
x_1 \\
x_2 \\
... \\
x_n
\end{array}\right)
+
\left(\begin{array}{cc}
y_1 \\
y_2 \\
... \\
y_n
\end{array}\right)
\rightarrow
\left(\begin{array}{cc}
z_1 \\
z_2 \\
... \\
z_n
\end{array}\right)
$$

In [18]:
x = 0.0;
@simd for n = 1:10
  global x
  x += 1;
  x *= 2;
end
x

2046.0

In [25]:
a = collect(1:100)

function show_simd_sum(A)
    result = zero(eltype(A))
    @inbounds @simd for i in eachindex(A)
        result = result + A[i]
        @show i
    end
    return result
end


function show_sum(A)
    result = zero(eltype(A))
    for i in eachindex(A)
        result = result + A[i]
        @show result
    end
    return result
end


show_sum (generic function with 1 method)

In [26]:
show_sum(a)

result = 1
result = 3
result = 6
result = 10
result = 15
result = 21
result = 28
result = 36
result = 45
result = 55
result = 66
result = 78
result = 91
result = 105
result = 120
result = 136
result = 153
result = 171
result = 190
result = 210
result = 231
result = 253
result = 276
result = 300
result = 325
result = 351
result = 378
result = 406
result = 435
result = 465
result = 496
result = 528
result = 561
result = 595
result = 630
result = 666
result = 703
result = 741
result = 780
result = 820
result = 861
result = 903
result = 946
result = 990
result = 1035
result = 1081
result = 1128
result = 1176
result = 1225
result = 1275
result = 1326
result = 1378
result = 1431
result = 1485
result = 1540
result = 1596
result = 1653
result = 1711
result = 1770
result = 1830
result = 1891
result = 1953
result = 2016
result = 2080
result = 2145
result = 2211
result = 2278
result = 2346
result = 2415
result = 2485
result = 2556
result = 2628
result = 2701
result = 2775
result = 2850
result = 2

5050

In [27]:
show_simd_sum(a)

result = 1
result = 3
result = 6
result = 10
result = 15
result = 21
result = 28
result = 36
result = 45
result = 55
result = 66
result = 78
result = 91
result = 105
result = 120
result = 136
result = 153
result = 171
result = 190
result = 210
result = 231
result = 253
result = 276
result = 300
result = 325
result = 351
result = 378
result = 406
result = 435
result = 465
result = 496
result = 528
result = 561
result = 595
result = 630
result = 666
result = 703
result = 741
result = 780
result = 820
result = 861
result = 903
result = 946
result = 990
result = 1035
result = 1081
result = 1128
result = 1176
result = 1225
result = 1275
result = 1326
result = 1378
result = 1431
result = 1485
result = 1540
result = 1596
result = 1653
result = 1711
result = 1770
result = 1830
result = 1891
result = 1953
result = 2016
result = 2080
result = 2145
result = 2211
result = 2278
result = 2346
result = 2415
result = 2485
result = 2556
result = 2628
result = 2701
result = 2775
result = 2850
result = 2

5050