### Using SharedArray is costly in terms of memory

In [1]:
# Launch 4 worker processes; the output below lists their process ids.
addprocs(4)

4-element Array{Int64,1}:
 2
 3
 4
 5

#### I added 4 workers and followed the official parallel computing documentation

In [2]:
# Return the pair (row range, column range) of `q` that the calling process
# should work on: every row, and one contiguous slice of the columns.
@everywhere function myrange(q::SharedArray)
    slot = indexpids(q)
    if slot != 0
        nparts = length(procs(q))
        # nparts + 1 evenly spaced cut points from 0 to the number of columns,
        # rounded to integers; slice k covers cuts[k]+1 through cuts[k+1].
        cuts = map(s -> round(Int, s), linspace(0, size(q, 2), nparts + 1))
        rows = 1:size(q, 1)
        cols = (cuts[slot] + 1):cuts[slot + 1]
        return rows, cols
    end
    # slot == 0: this worker is not assigned a piece, so hand back empty ranges.
    return 1:0, 1:0
end

# Here's the kernel
# Kernel: for each t in `trange`, set q[i,j,t+1] to q[i,j,t] + u[i,j,t] over the
# given `irange` x `jrange` patch. Mutates `q` in place and returns it.
@everywhere function advection_chunk!(q, u, irange, jrange, trange)
    @show (irange, jrange, trange)  # display so we can see what's happening
    for t in trange
        tnext = t + 1
        for j in jrange
            # innermost loop runs over the first index
            for i in irange
                q[i, j, tnext] = q[i, j, t] + u[i, j, t]
            end
        end
    end
    return q
end

# Here's a convenience wrapper for a SharedArray implementation
@everywhere function advection_shared_chunk!(q, u)
    # Rows/columns come from myrange(q); the time range is every step but the last.
    irange, jrange = myrange(q)
    trange = 1:(size(q, 3) - 1)
    return advection_chunk!(q, u, irange, jrange, trange)
end

#### Serial code

In [3]:
function advection_serial!(q, u)
    # Treat the whole array as a single chunk: all rows, all columns, and every
    # time step but the last.
    nrows, ncols, nsteps = size(q, 1), size(q, 2), size(q, 3)
    return advection_chunk!(q, u, 1:nrows, 1:ncols, 1:(nsteps - 1))
end

advection_serial! (generic function with 1 method)

#### Parallel code

In [4]:
function advection_shared!(q, u)
    # Start one task per process listed by procs(q); each task issues a
    # remotecall_wait for that process's chunk, and @sync holds until every
    # task has finished.
    @sync for pid in procs(q)
        @async remotecall_wait(advection_shared_chunk!, pid, q, u)
    end
    return q
end

advection_shared! (generic function with 1 method)

In [5]:
# Inputs for the first two timings: two 500x500x500 Float64 SharedArrays.
q = SharedArray(Float64, (500,500,500))
u = SharedArray(Float64, (500,500,500))

# Warm-up: run each version once so the timed runs below do not include
# JIT compilation.
advection_serial!(q, u);
advection_shared!(q, u);

(irange,jrange,trange) = (1:500,1:500,1:499)
	From worker 2:	(irange,jrange,trange) = (1:500,1:125,1:499)
	From worker 5:	(irange,jrange,trange) = (1:500,376:500,1:499)
	From worker 3:	(irange,jrange,trange) = (1:500,126:250,1:499)
	From worker 4:	(irange,jrange,trange) = (1:500,251:375,1:499)


In [7]:
# Time the serial kernel on the SharedArray inputs.
@time advection_serial!(q, u);

(irange,jrange,trange) = (1:500,1:500,1:499)
  0.753682 seconds (350 allocations: 16.625 KB)


In [9]:
# Time the parallel version on the same SharedArray inputs.
@time advection_shared!(q, u);

	From worker 2:	(irange,jrange,trange) = (1:500,1:125,1:499)
	From worker 3:	(irange,jrange,trange) = (1:500,126:250,1:499)
	From worker 4:	(irange,jrange,trange) = (1:500,251:375,1:499)
	From worker 5:	(irange,jrange,trange) = (1:500,376:500,1:499)
  0.312011 seconds (3.37 k allocations: 244.141 KB)


#### The documentation shows that the parallel code runs faster than the serial code. But the serial code does not need a SharedArray at all!

In [10]:
# Plain (non-shared) arrays of the same size as before. zeros() is used rather
# than an uninitialized Array so the kernel never reads undefined memory.
q = zeros(Float64, (500,500,500))
u = zeros(Float64, (500,500,500));
# Warm-up run so the timed call below excludes JIT compilation for Array arguments.
advection_serial!(q, u);

(irange,jrange,trange) = (1:500,1:500,1:499)


In [13]:
# Time the serial kernel again, this time on the plain Array inputs.
@time advection_serial!(q, u);

(irange,jrange,trange) = (1:500,1:500,1:499)
  0.310071 seconds (350 allocations: 16.625 KB)


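#### Serial code on a plain Array runs (marginally) faster than the parallel code on SharedArrays: 0.310 s vs 0.312 s. Its memory allocation is also much smaller: 16.625 KB vs 244.141 KB.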