In [45]:
# Start four additional worker processes; the vector printed below lists their process ids.
addprocs(4)

4-element Array{Int64,1}:
 5
 6
 7
 8

In [46]:
# Return the (row range, column range) of `q` that the calling process should
# update.  The columns of `q` are divided into `length(procs(q))` contiguous,
# nearly equal chunks, and chunk number `indexpids(q)` is handed back together
# with the full row range.  A process for which `indexpids(q) == 0` receives a
# pair of empty ranges, so looping over the result does nothing.
@everywhere function myrange(q::SharedArray)
    idx = indexpids(q)
    if idx == 0
        # This worker is not assigned a piece
        return 1:0, 1:0
    end
    nchunks = length(procs(q))
    ncols = size(q, 2)
    # Chunk boundaries are the rounded multiples of ncols/nchunks, i.e. the same
    # nchunks+1 evenly spaced points from 0 to ncols that `linspace` produced.
    # They are computed directly so the function does not depend on `linspace`,
    # which was removed in Julia 1.0.
    splits = [round(Int, k * ncols / nchunks) for k in 0:nchunks]
    return 1:size(q, 1), splits[idx]+1:splits[idx+1]
end

# Here's the kernel: add `u` to `q` element by element over the index window
# irange × jrange, updating `q` in place and returning it.  Columns form the
# outer loop and rows the inner one.
@everywhere function advection_chunk!(q, u, irange, jrange)
    @show (irange, jrange)  # display so we can see what's happening
    for col in jrange
        for row in irange
            q[row, col] += u[row, col]
        end
    end
    return q
end

# Here's a convenience wrapper for a SharedArray implementation: look up the
# index ranges that `myrange` assigns to the calling process and run the
# kernel on exactly those ranges.
@everywhere function advection_shared_chunk!(q, u)
    irange, jrange = myrange(q)
    return advection_chunk!(q, u, irange, jrange)
end

In [47]:
# Serial reference version: run the kernel once over every row and column of `q`.
function advection_serial!(q, u)
    allrows = 1:size(q, 1)
    allcols = 1:size(q, 2)
    return advection_chunk!(q, u, allrows, allcols)
end

advection_serial! (generic function with 1 method)

In [None]:
# Add `u` to `q` in place with the column loop handed to `@parallel`; `@sync`
# blocks until the whole loop has finished, after which `q` is returned.
# NOTE(review): the updates run in whichever processes `@parallel` selects, so
# `q` presumably has to be memory those processes share (e.g. a SharedArray)
# for the caller to see the result — confirm before passing a plain Array.
function advection_parallel!(q, u)
    @sync @parallel for j = 1:size(q, 2)
        for i in 1:size(q, 1)
            q[i, j] += u[i, j]
        end
    end
    return q
end

In [48]:
# SharedArray driver: ask every process in `procs(q)` to run
# `advection_shared_chunk!` on `q` and `u`, wait until all of those calls have
# completed, then return `q`.
function advection_shared!(q, u)
    @sync for pid in procs(q)
        # One task per process, so no remote call waits for the previous one to finish.
        @async remotecall_wait(advection_shared_chunk!, pid, q, u)
    end
    return q
end

advection_shared! (generic function with 1 method)

In [None]:
# Start two more worker processes.
# NOTE(review): this cell carries no execution count, so it may never have been run — confirm.
addprocs(2)

In [80]:
# Allocate the two 5×10000 shared arrays used by the timing cells.
# NOTE(review): the arrays are never explicitly filled; the printed result is
# all zeros — confirm that zero-initialisation can be relied on.
q = SharedArray(Float64, (5,10000))
u = SharedArray(Float64, (5,10000))
# Call both implementations once here, so the later @time cells are not
# measuring the first invocation of either function.
advection_serial!(q,u)
advection_shared!(q,u)
#@time advection_serial!(q, u)

(irange,jrange) = (1:5,1:10000)
	From worker 2:	(irange,jrange) = (1:5,1:1429)
	From worker 3:	(irange,jrange) = (1:5,1430:2857)
	From worker 5:	(irange,jrange) = (1:5,4287:5714)
	From worker 7:	(irange,jrange) = (1:5,7144:8571)
	From worker 8:	(irange,jrange) = (1:5,8572:10000)
	From worker 4:	(irange,jrange) = (1:5,2858:4286)


5x10000 SharedArray{Float64,2}:
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0

	From worker 6:	(irange,jrange) = (1:5,5715:7143)


In [83]:
# Time the serial version over the whole array.
@time advection_serial!(q, u)

(irange,jrange) = 

5x10000 SharedArray{Float64,2}:
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0

(1:5,1:10000)
  0.000565 seconds (335 allocations: 15.891 KB)


In [84]:
# Time the SharedArray version.
# NOTE(review): advection_shared! was already called once in the cell that
# creates q and u, so this is presumably not the first (compiling) call — confirm
# against the actual execution order.
@time advection_shared!(q,u)

	From worker 2:	(irange,jrange) = (1:5,1:1429)
	From worker 4:	(irange,jrange) = (1:5,2858:4286)
	From worker 3:	(irange,jrange) = (1:5,1430:2857)
	From worker 5:	(irange,jrange) = (1:5,4287:5714)


5x10000 SharedArray{Float64,2}:
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0

	From worker 6:	(irange,jrange) = (1:5,5715:7143)
	From worker 7:	(irange,jrange) = (1:5,7144:8571)
	From worker 8:	(irange,jrange) = (1:5,8572:10000)
  0.034747 seconds (8.04 k allocations: 605.375 KB)


In [2]:
# Ratio of the SharedArray timing to the serial timing reported by the two @time
# cells above: the shared version comes out roughly 61 times slower at this size.
0.034747/0.000565

61.499115044247795