In [169]:
import Base.Threads: AbstractLock, TatasLock, lock!, trylock!, unlock!, @threads, nthreads, threadid

In [2]:
# @with_lock(lock, body)
#
# Acquire `lock` with `lock!`, evaluate `body`, and release the lock with
# `unlock!` even when `body` throws. The expansion evaluates to the value of
# `body`.
macro with_lock(lock, body)
    quote
        # Evaluate the lock expression exactly once so that the lock which is
        # released is the very one that was acquired.
        held = $(esc(lock))
        lock!(held)
        try
            # `esc` makes the user's code resolve names in the caller's scope
            # instead of the module this macro was defined in.
            $(esc(body))
        finally
            unlock!(held)
        end
    end
end
# @try_with_lock(lock, successbody, failbody)
#
# Try to acquire `lock` without blocking. When the lock is obtained, evaluate
# `successbody` and release the lock afterwards (also on exceptions);
# otherwise evaluate `failbody`. The expansion evaluates to the value of the
# branch that ran.
macro try_with_lock(lock, successbody, failbody)
    quote
        # Evaluate the lock expression once; the same object is tried and released.
        held = $(esc(lock))
        if trylock!(held) == 0 # trylock! returns 0 when the lock was acquired
            try
                # `esc` keeps the user's code in the caller's scope.
                $(esc(successbody))
            finally
                unlock!(held)
            end
        else
            $(esc(failbody))
        end
    end
end
    


@try_with_lock (macro with 1 method)

In [3]:
# Circular queue over a fixed-size buffer, carrying its own spin lock.
#
# `CircQueue{T}(len)` allocates `len + 1` slots; `front` and `back` both start
# at the last slot of the buffer.
type CircQueue{T}
    buffer::Vector{T}   # backing storage
    front::Int64        # index of the next element to dequeue
    back::Int64         # index of the next slot to enqueue into
    lock::TatasLock     # lock used by the locking entry points

    function CircQueue(len::Int64)
        slots = len + 1
        new(Vector{T}(slots), slots, slots, TatasLock())
    end
end

# Thrown by `_enqueue!` when the queue reports itself as full.
immutable QueueFullError <: Exception end

# Thrown by `_dequeue!` when the queue reports itself as empty.
immutable QueueEmptyError <: Exception end


In [4]:
# Wrap the 1-based index `ii` into the range 1:len (0 maps to len, len+1 maps to 1).
function wrap_index(ii::Int64, len)
    offset = mod(ii - 1, len)
    return offset + 1
end
# Wrap `ii` into the valid index range of the queue's buffer.
function wrap_index(ii::Int64, q::CircQueue)
    nslots = length(q.buffer)
    return wrap_index(ii, nslots)
end


# True when the queue holds no elements (front and back coincide). Takes no lock.
_empty(q::CircQueue) = q.front == q.back

# True when advancing `back` by one slot would land on `front`. Takes no lock.
function _full(q::CircQueue)
    next_back = wrap_index(q.back - 1, q)
    return q.front == next_back
end

# Store `v` at the back of `q` without taking the lock and return `q`.
# Throws `QueueFullError` when the queue is full.
function _enqueue!{T}(q::CircQueue{T}, v::T)
    _full(q) && throw(QueueFullError())
    slot = q.back
    q.buffer[slot] = v
    # The back index moves downwards through the buffer, wrapping at 1.
    q.back = wrap_index(slot - 1, q)
    return q
end

# Enqueue `v` while holding the queue's lock; waits for the lock if needed.
# Propagates `QueueFullError` from `_enqueue!`.
function enqueue!{T}(q::CircQueue{T}, v::T)
    return @with_lock(q.lock, _enqueue!(q, v))
end

# Enqueue `v` only if the lock can be taken immediately.
# Returns `true` when the element was stored, `false` when the lock was busy.
# A full queue still raises `QueueFullError` once the lock is held.
function try_enqueue!{T}(q::CircQueue{T}, v::T)::Bool
    @try_with_lock(
        q.lock,
        begin
            _enqueue!(q, v)
            true
        end,
        false
    )
end


# Remove and return the element at the front of `q` without taking the lock.
# Throws `QueueEmptyError` when the queue holds no elements.
function _dequeue!{T}(q::CircQueue{T}) :: T
    if _empty(q)
        throw(QueueEmptyError())
    end

    ret = q.buffer[q.front]
    # The front index moves downwards through the buffer, wrapping at 1.
    q.front = wrap_index(q.front-1, q)
    ret
end

# Dequeue the front element while holding the queue's lock; waits for the lock
# if needed. Propagates `QueueEmptyError` from `_dequeue!`.
function dequeue!{T}(q::CircQueue{T})::T
    return @with_lock(q.lock, _dequeue!(q))
end

# Dequeue only if the lock can be taken immediately.
# Returns a filled `Nullable{T}` on success and an empty one when the lock was
# busy. An empty queue still raises `QueueEmptyError` once the lock is held.
function try_dequeue!{T}(q::CircQueue{T})::Nullable{T}
    @try_with_lock(
        q.lock,
        Nullable{T}(_dequeue!(q)),
        Nullable{T}()
    )
end

try_dequeue! (generic function with 1 method)

In [5]:
using Base.Test

In [6]:
# Single element round trip.
q= CircQueue{Int64}(20)
enqueue!(q, 10)
@test dequeue!(q) == 10 

# Elements come out in the order they were put in (FIFO).
q= CircQueue{Int64}(5)
enqueue!(q, 10)
enqueue!(q, 20)
enqueue!(q, 30)
@test dequeue!(q) == 10 
@test dequeue!(q) == 20 
@test dequeue!(q) == 30


# FIFO order is kept when enqueues and dequeues are interleaved.
q= CircQueue{Int64}(5)
enqueue!(q, 10)
enqueue!(q, 20)
@test dequeue!(q) == 10 
enqueue!(q, 30)
@test dequeue!(q) == 20 
@test dequeue!(q) == 30

# Dequeueing from a fresh queue raises QueueEmptyError.
q=CircQueue{Int64}(2)
@test_throws(QueueEmptyError, dequeue!(q))

# A queue built with length 2 accepts two elements; the third raises QueueFullError.
q=CircQueue{Int64}(2)
enqueue!(q, 10)
enqueue!(q, 20)
@test_throws(QueueFullError, enqueue!(q, 30))


# The try_* variants succeed while the lock is free and report failure
# (false / null) while the lock is held elsewhere, leaving the queue untouched.
q=CircQueue{Int64}(10)
@test try_enqueue!(q, 20) == true
@test get(try_dequeue!(q)) == 20
enqueue!(q,10)
lock!(q.lock)
@test try_enqueue!(q, 20) == false
@test isnull(try_dequeue!(q))
unlock!(q.lock)
@test dequeue!(q) == 10 


# With the lock held, try_enqueue! on a full queue returns false instead of
# throwing, because the lock check comes before the fullness check.
q=CircQueue{Int64}(3)
enqueue!(q, 10)
enqueue!(q, 20)
enqueue!(q, 30)
# The queue is now full
lock!(q.lock)
@test try_enqueue!(q, 20) == false
unlock!(q.lock)

0

In [112]:
# Show the LLVM version this Julia build uses (relevant for the llvmcall IR below).
Base.libllvm_version

"3.7.1"

In [122]:
import Base.Threads: Atomic, atomic_add!, @threads
import Core.Intrinsics: llvmcall
import Base: unsafe_convert

In [149]:
import Base.Threads: atomictypes, llvmtypes,inttype
# Define `atomic_inc!(x::Atomic{T}, v::T)` for every integer atomic type.
#
# Each method atomically adds the constant 1 to `x` with acq_rel ordering and
# returns the value `x` held before the increment. The argument `v` is accepted
# but not used by the generated code.
for typ in atomictypes
    # Only integer element types get an increment operation.
    typ <: Integer || continue
    lt = llvmtypes[typ]

    # LLVM's `atomicrmw` has no `inc` operation; an increment is expressed as
    # `atomicrmw add` with the constant operand 1.
    @eval atomic_inc!(x::Atomic{$typ}, v::$typ) =
        llvmcall($"""
                 %rv = atomicrmw add $lt* %0, $lt 1 acq_rel
                 ret $lt %rv
                 """, $typ, Tuple{Ptr{$typ}}, unsafe_convert(Ptr{$typ}, x))
end


In [150]:
# Plain (non-atomic) increment: returns `a + 1`.
s(a) = a + 1
# Print the LLVM IR generated for the integer case.
@code_llvm s(1)


define i64 @julia_s_69737(i64) #0 {
top:
  %1 = add i64 %0, 1
  ret i64 %1
}


In [160]:
?unsafe_convert

search:



```
unsafe_convert(T,x)
```

Convert `x` to a value of type `T`

In cases where `convert` would need to take a Julia object and turn it into a `Ptr`, this function should be used to define and perform that conversion.

Be careful to ensure that a Julia reference to `x` exists as long as the result of this function will be used. Accordingly, the argument `x` to this function should never be an expression, only a variable name or field reference. For example, `x=a.b.c` is acceptable, but `x=[a,b,c]` is not.

The `unsafe` prefix on this function indicates that using the result of this function after the `x` argument to this function is no longer accessible to the program may cause undefined behavior, including program corruption or segfaults, at any later time.


In [162]:
# Thin wrapper around `atomic_monotonic_inc!`, which is not defined in the
# visible part of this file. NOTE(review): presumably returns the value held
# before the increment -- confirm against its definition.
t(next_i) = atomic_monotonic_inc!(next_i)

a  = Atomic{Int32}(1)
# Print the LLVM IR generated for the Atomic{Int32} case.
@code_llvm t(a)


define %jl_value_t* @julia_t_69818(%jl_value_t*) #0 {
top:
  %thread_ptr = call i8* asm "movq %fs:0, $0", "=r"() #2
  %ptls_i8 = getelementptr i8, i8* %thread_ptr, i64 -2672
  %ptls = bitcast i8* %ptls_i8 to %jl_value_t***
  %1 = alloca [4 x %jl_value_t*], align 8
  %.sub = getelementptr inbounds [4 x %jl_value_t*], [4 x %jl_value_t*]* %1, i64 0, i64 0
  %2 = getelementptr [4 x %jl_value_t*], [4 x %jl_value_t*]* %1, i64 0, i64 2
  %3 = bitcast %jl_value_t** %2 to i8*
  call void @llvm.memset.p0i8.i32(i8* %3, i8 0, i32 16, i32 8, i1 false)
  %4 = bitcast [4 x %jl_value_t*]* %1 to i64*
  store i64 4, i64* %4, align 8
  %5 = getelementptr [4 x %jl_value_t*], [4 x %jl_value_t*]* %1, i64 0, i64 1
  %6 = bitcast i8* %ptls_i8 to i64*
  %7 = load i64, i64* %6, align 8
  %8 = bitcast %jl_value_t** %5 to i64*
  store i64 %7, i64* %8, align 8
  store %jl_value_t** %.sub, %jl_value_t*** %ptls, align 8
  %9 = getelementptr [4 x %jl_value_t*], [4 x %jl_value_t*]* %1, i64 0, i64 3
  store %jl_value_

In [124]:
# Build the expansion of `@direct_threads for <index> = <range> ... end`.
#
# `iter` is the loop header expression (index in args[1], range in args[2]) and
# `lbody` is the loop body. The generated code defines a worker function that
# repeatedly claims the next position from a shared atomic counter and runs the
# body for it until the range is exhausted. The worker is invoked directly on
# the calling thread; it is not handed to the threading runtime.
function _direct_threadsfor(iter,lbody)
    worker = gensym("_direct_threadsfor")
    loopvar = iter.args[1]
    space = iter.args[2]
    quote
        cursor = Atomic{Int32}(1)
        function $worker()
            items = $(esc(space))
            while true
                # Claim a position; stop once it lies past the end of the range.
                pos = atomic_monotonic_add!(cursor, one(Int32))
                if pos > length(items)
                    break
                end

                local $(esc(loopvar)) = Base.unsafe_getindex(items, pos)
                $(esc(lbody))
            end
        end
        $worker()
    end
end

# @direct_threads for <index> = <range> ... end
#
# Accepts exactly one argument, which must be a `for` loop expression, and
# expands it through `_direct_threadsfor`. Anything else raises ArgumentError
# at expansion time.
macro direct_threads(args...)
    length(args) == 1 ||
        throw(ArgumentError("wrong number of arguments in @direct_threads"))
    loop = args[1]
    isa(loop, Expr) ||
        throw(ArgumentError("need an expression argument to @direct_threads"))
    loop.head === :for ||
        throw(ArgumentError("unrecognized argument to @direct_threads"))
    return _direct_threadsfor(loop.args[1], loop.args[2])
end

@direct_threads (macro with 1 method)

In [208]:
# Inspect the value currently bound to the global name `range` in this session.
range

1:104

In [212]:
# Build the expansion of `@ot_threads for <index> = <range> ... end`.
#
# `iter` is the loop header expression (index in args[1], range in args[2]) and
# `lbody` is the loop body. The range is cut into `nthreads() + leftover_units`
# shares of equal length `len`. Every thread first runs its own fixed share;
# the remaining `leftover_units` shares plus the division remainder form a
# shared pool that threads drain one position at a time via an atomic counter.
function _ot_threadsfor(iter,lbody,leftover_units = 1)
    fun = gensym("_ot_threadsfor")
    lidx = iter.args[1]         # index
    range = iter.args[2]
    quote
        # Evaluate the user's range expression once and share it with all threads.
        r = $(esc(range))
        # Splice the macro-time argument in; a bare `leftover_units` here would
        # refer to an unrelated variable at run time.
        units = $(esc(leftover_units))
        len, rem = divrem(length(r), nthreads() + units)
        num_leftover = units*len + rem
        left_over_start = len*nthreads() + 1
        left_over_end = len*nthreads() + num_leftover

        next_i = Atomic{Int}(left_over_start)
        function $fun()
            tid = threadid()
            f = 1 + ((tid-1) * len)
            l = f + len - 1
            # run this thread's iterations
            for i in f:l
                local $(esc(lidx)) = Base.unsafe_getindex(r,i)
                $(esc(lbody))
            end
            # Work through the shared leftover pool
            while true
                i = atomic_add!(next_i,one(Int)) # read current value, then increment
                i > left_over_end && break

                local $(esc(lidx)) = Base.unsafe_getindex(r,i)
                $(esc(lbody))
            end
        end
        ccall(:jl_threading_run, Void, (Any,), Core.svec($fun))
    end
end

# @ot_threads for <index> = <range> ... end
#
# Accepts exactly one argument, which must be a `for` loop expression, and
# expands it through `_ot_threadsfor` with its default leftover setting.
# Anything else raises ArgumentError at expansion time.
macro ot_threads(args...)
    length(args) == 1 ||
        throw(ArgumentError("wrong number of arguments in @ot_threads"))
    loop = args[1]
    isa(loop, Expr) ||
        throw(ArgumentError("need an expression argument to @ot_threads"))
    loop.head === :for ||
        throw(ArgumentError("unrecognized argument to @ot_threads"))
    return _ot_threadsfor(loop.args[1], loop.args[2])
end

@ot_threads (macro with 1 method)

In [213]:
# Baseline: fill a 331-element Float64 vector with its own indices using a plain loop.
function count_test_normal()
    n = 331
    out = Vector{Float64}(n)
    for k in 1:n
        out[k] = k
    end
    return out
end

# Same fill as the baseline, with the loop driven by Base's `@threads`.
function count_test_threads()
    n = 331
    out = Vector{Float64}(n)
    @threads for k in 1:n
        out[k] = k
    end
    return out
end

# Same fill as the baseline, with the loop driven by `@direct_threads`.
function count_test_directthreads()
    n = 331
    out = Vector{Float64}(n)
    @direct_threads for k in 1:n
        out[k] = k
    end
    return out
end


# Same fill as the baseline, with the loop driven by `@ot_threads`.
function count_test_otthreads()
    n = 331
    out = Vector{Float64}(n)
    @ot_threads for k in 1:n
        out[k] = k
    end
    return out
end
# Put the four result vectors side by side, one column per loop variant.
hcat(count_test_normal(),
     count_test_threads(),
     count_test_directthreads(),
     count_test_otthreads())

331×4 Array{Float64,2}:
   1.0    1.0    1.0    1.0
   2.0    2.0    2.0    2.0
   3.0    3.0    3.0    3.0
   4.0    4.0    4.0    4.0
   5.0    5.0    5.0    5.0
   6.0    6.0    6.0    6.0
   7.0    7.0    7.0    7.0
   8.0    8.0    8.0    8.0
   9.0    9.0    9.0    9.0
  10.0   10.0   10.0   10.0
  11.0   11.0   11.0   11.0
  12.0   12.0   12.0   12.0
  13.0   13.0   13.0   13.0
  14.0   14.0   14.0   14.0
  15.0   15.0   15.0   15.0
  16.0   16.0   16.0   16.0
  17.0   17.0   17.0   17.0
  18.0   18.0   18.0   18.0
  19.0   19.0   19.0   19.0
  20.0   20.0   20.0   20.0
  21.0   21.0   21.0   21.0
  22.0   22.0   22.0   22.0
  23.0   23.0   23.0   23.0
  24.0   24.0   24.0   24.0
  25.0   25.0   25.0   25.0
  26.0   26.0   26.0   26.0
  27.0   27.0   27.0   27.0
  28.0   28.0   28.0   28.0
  29.0   29.0   29.0   29.0
  30.0   30.0   30.0   30.0
  31.0   31.0   31.0   31.0
  32.0   32.0   32.0   32.0
  33.0   33.0   33.0   33.0
  34.0   34.0   34.0   34.0
  35.0   35.0   35.0   3

In [214]:
# Unbalanced-workload benchmark, serial baseline.
#
# For every item `ii` in 1:len, sums gcd(j, ii-1) over j in 1:8^(ii%8), so the
# amount of work differs strongly between neighbouring items.
#
# `len` is the number of items (default 1000, the size used by the benchmarks).
# Returns `(rets, times)`: the per-item sums and the per-item timings taken
# from the second element of `@timed`'s result.
function ubperf_test_normal(len::Integer=1000)
    rets = Vector{Float64}(len)
    times = Vector{Float64}(len)

    for ii in 1:len
        rets[ii]=0
        timed = @timed for j in 1:8^(ii%8)
            rets[ii]+=gcd(j,ii-1)
        end
        times[ii] = timed[2]
    end
    rets, times
end

# Unbalanced-workload benchmark, outer loop driven by Base's `@threads`.
#
# For every item `ii` in 1:len, sums gcd(j, ii-1) over j in 1:8^(ii%8), so the
# amount of work differs strongly between neighbouring items.
#
# `len` is the number of items (default 1000, the size used by the benchmarks).
# Returns `(rets, times)`: the per-item sums and the per-item timings taken
# from the second element of `@timed`'s result.
function ubperf_test_threads(len::Integer=1000)
    rets = Vector{Float64}(len)
    times = Vector{Float64}(len)

    @threads for ii in 1:len
        rets[ii]=0
        timed = @timed for j in 1:8^(ii%8)
            rets[ii]+=gcd(j,ii-1)
        end
        times[ii] = timed[2]
    end
    rets, times
end

# Unbalanced-workload benchmark, outer loop driven by `@direct_threads`.
#
# For every item `ii` in 1:len, sums gcd(j, ii-1) over j in 1:8^(ii%8), so the
# amount of work differs strongly between neighbouring items.
#
# `len` is the number of items (default 1000, the size used by the benchmarks).
# Returns `(rets, times)`: the per-item sums and the per-item timings taken
# from the second element of `@timed`'s result.
function ubperf_test_directthreads(len::Integer=1000)
    rets = Vector{Float64}(len)
    times = Vector{Float64}(len)

    @direct_threads for ii in 1:len
        rets[ii]=0
        timed = @timed for j in 1:8^(ii%8)
            rets[ii]+=gcd(j,ii-1)
        end
        times[ii] = timed[2]
    end
    rets, times
end

# Unbalanced-workload benchmark, outer loop driven by `@ot_threads`.
#
# For every item `ii` in 1:len, sums gcd(j, ii-1) over j in 1:8^(ii%8), so the
# amount of work differs strongly between neighbouring items.
#
# `len` is the number of items (default 1000, the size used by the benchmarks).
# Returns `(rets, times)`: the per-item sums and the per-item timings taken
# from the second element of `@timed`'s result.
function ubperf_test_otthreads(len::Integer=1000)
    rets = Vector{Float64}(len)
    times = Vector{Float64}(len)

    @ot_threads for ii in 1:len
        rets[ii]=0
        timed = @timed for j in 1:8^(ii%8)
            rets[ii]+=gcd(j,ii-1)
        end
        times[ii] = timed[2]
    end
    rets, times
end

ubperf_test_otthreads (generic function with 1 method)

In [216]:
# Three benchmark rounds. Every variant is timed after a forced garbage
# collection so that all runs start from a comparable heap state.
gc()
normal1, normal1_t=@time ubperf_test_normal()
gc()
threads1, threads1_t=@time ubperf_test_threads()
gc()
directthreads1, directthreads1_t = @time ubperf_test_directthreads()
gc()
otthreads1, otthreads1_t = @time ubperf_test_otthreads()
println("---")
gc()
normal2, normal2_t=@time ubperf_test_normal()
gc()
threads2, threads2_t=@time ubperf_test_threads()
gc()
directthreads2, directthreads2_t = @time ubperf_test_directthreads()
gc()
otthreads2, otthreads2_t = @time ubperf_test_otthreads()
println("---")
gc()
normal3, normal3_t=@time ubperf_test_normal()
gc()
threads3, threads3_t=@time ubperf_test_threads()
gc()
directthreads3, directthreads3_t = @time ubperf_test_directthreads()
gc()
# Bind the third round's result to `otthreads3`, the name the check below reads.
otthreads3, otthreads3_t = @time ubperf_test_otthreads()
println("---")

# All loop variants must produce identical results in every round.
normal3 == normal2 == normal1 == threads1 == threads2 == threads3 == directthreads1 == directthreads2 == directthreads3 == otthreads1 == otthreads2 == otthreads3 

 18.150811 seconds (7 allocations: 16.063 KB)
  2.557432 seconds (9 allocations: 16.125 KB)
 44.982111 seconds (9 allocations: 16.125 KB)
  4.606303 seconds (234 allocations: 19.672 KB)
---
 51.565099 seconds (7 allocations: 16.063 KB)
  7.159351 seconds (9 allocations: 16.125 KB)
 26.438624 seconds (9 allocations: 16.125 KB)
  2.897004 seconds (234 allocations: 19.672 KB)
---
 47.340607 seconds (7 allocations: 16.063 KB)
  5.034009 seconds (9 allocations: 16.125 KB)
 82.163128 seconds (9 allocations: 16.125 KB)
  1.129753 seconds (234 allocations: 19.672 KB)
---


true

In [101]:
# Time the serial baseline three times, forcing a garbage collection before
# each run, to see how much the timing varies from run to run.
gc()
normal1, normal1_t=@time ubperf_test_normal()
gc()
normal2, normal2_t=@time ubperf_test_normal()
gc()
normal3, normal3_t=@time ubperf_test_normal()

# The computed results must be identical across runs.
normal3 == normal2 == normal1

 19.725980 seconds (7 allocations: 16.063 KB)
 17.836339 seconds (7 allocations: 16.063 KB)
 37.605430 seconds (7 allocations: 16.063 KB)


true

In [111]:
# Serial unbalanced-workload benchmark that writes into caller-supplied buffers.
#
# For every item in 1:len, stores in `rets` the sum of gcd(j, item-1) over
# j in 1:8^(item%8) and in `times` the second element of `@timed`'s result for
# that inner loop. Both buffers are overwritten in place and returned as
# `(rets, times)`.
function ubperf_test_normal!(len=1000, rets = Vector{Float64}(len), times = Vector{Float64}(len))
    for idx in 1:len
        rets[idx] = 0
        stats = @timed for j in 1:8^(idx%8)
            rets[idx] += gcd(j, idx - 1)
        end
        times[idx] = stats[2]
    end
    return rets, times
end

# Time the in-place serial baseline three times. Fresh output buffers are
# allocated outside the timed call each time, and a garbage collection is
# forced right before every run.
len=1000;

rets = Vector{Float64}(len);
times = Vector{Float64}(len)
gc()
normal1, normal1_t=@time ubperf_test_normal!(len, rets,times)

rets = Vector{Float64}(len);
times = Vector{Float64}(len)
gc()
normal2, normal2_t=@time ubperf_test_normal!(len, rets,times)

rets = Vector{Float64}(len);
times = Vector{Float64}(len)
gc()
normal3, normal3_t=@time ubperf_test_normal!(len, rets,times)

# The computed results must be identical across runs.
normal3 == normal2 == normal1

 18.827280 seconds (6.36 k allocations: 273.581 KB)
 09.852547 seconds (5 allocations: 192 bytes)
 18.366248 seconds (5 allocations: 192 bytes)


true

In [106]:
# Dump the native code generated for the serial baseline.
@code_native ubperf_test_normal()

	.text
Filename: In[98]
	pushq	%rbp
	movq	%rsp, %rbp
	pushq	%r15
	pushq	%r14
	pushq	%r13
	pushq	%r12
	pushq	%rbx
	subq	$680, %rsp              # imm = 0x2A8
	movabsq	$jl_hrtime, %r12
	movabsq	$140276862454416, %r15  # imm = 0x7F94C08E8A90
	movq	%fs:0, %rax
	movq	$0, -432(%rbp)
	movq	$0, -440(%rbp)
	movq	$0, -448(%rbp)
	movq	$0, -456(%rbp)
	movq	$0, -464(%rbp)
	movq	$0, -472(%rbp)
	movq	$0, -480(%rbp)
	movq	$0, -488(%rbp)
	movq	$0, -496(%rbp)
	movq	$0, -504(%rbp)
	movq	$20, -520(%rbp)
	movq	-2672(%rax), %rcx
	movq	%rcx, -512(%rbp)
	leaq	-520(%rbp), %rcx
	movq	%rcx, -2672(%rax)
	leaq	-2672(%rax), %rax
Source line: 3
	movq	%rax, -720(%rbp)
	leaq	17504(%r12), %r14
	movl	$1000, %esi             # imm = 0x3E8
	movq	%r15, %rdi
	callq	*%r14
	movq	%rax, %rbx
	movq	%rbx, -504(%rbp)
	movl	$1000, %esi             # imm = 0x3E8
Source line: 4
	movq	%r15, %rdi
	callq	*%r14
	movq	%rax, -656(%rbp)
	movq	%rax, -496(%rbp)
	movl	$1, %r13d
Source line: 276
	leaq	191200(%r12), %r12
	movq	%r12, -664(%rbp)
	

In [108]:
# Place four per-item vectors side by side and sum each column.
# NOTE(review): the first column is `normal3` (the computed results), not a
# timing vector like the other three -- confirm this is intended.
ts = [normal3 normal1_t normal2_t normal3_t]
sum(ts,1)

1×4 Array{Float64,2}:
 1.43804e9  19.7253  18.8349  19.6047

In [10]:
# Scratch cell: a random permutation of 1:(factorial(ii%10)*100).
ii=2
shuffle(1:factorial(ii%10)*100)

200-element Array{Int64,1}:
 123
 147
 169
 118
  18
 182
   9
 144
  20
   3
  88
  50
  77
 116
  16
 141
 183
 170
  55
  33
 164
  12
  90
 138
 106
 133
  74
  98
  99
  15
  62
 153
  43
 165
 178
   2
  76
 171
  37
 161
  19
  63
  70
  10
 132
 120
  17
 159
 155
 195
 127
 148
  44
  41
 160
 102
  59
  85
  95
   7
 135
 115
   8
  25
 143
 146
 184
 113
 131
  80
  91
 129
  69
  21
 100
  89
 136
 154
 139
  28
 191
 188
 119
  72
 168
 163
  83
 128
  22
 177
  40
 187
 151
  45
 194
 176
  96
 190
 145
  54
  35
 126
 167
   5
  26
  93
  36
  32
 189
  65
 192
 152
 111
 117
  97
 179
 156
  47
 112
   1
  68
  92
  73
 142
 162
 186
  48
  82
  31
 173
  61
  46
 107
 108
 114
  81
 172
 149
 101
 198
 199
 157
 193
  56
  42
 181
  38
 197
  34
  14
   6
 124
 103
  52
 121
  53
 180
  87
 196
 110
 109
 134
  13
 175
  78
 150
 158
   4
 200
  66
  27
  79
  57
  39
  64
 105
 185
  86
 104
 122
  49
  84
 174
  24
 166
  23
  60
  29
  11
  75
  30
 140
  94
  67
 1

In [None]:
;git add "WorkStealing.ipynb"

In [None]:
# Mutable holder for a single counter value shared between tasks.
type Counter
    count::Any   # current count; the field is left untyped
end

# Increment `counter.count` up to ten times, writing timestamped trace lines
# for task `id` into `iobuff` before and after every increment. Stops early,
# after logging a termination line, as soon as the count reaches exactly 15.
function docount(counter::Counter, id, iobuff)
    # Current wall-clock time formatted with "%f"; called once per trace line.
    stamp() = @sprintf("%f",time())
    for i in 1:10
        println(iobuff, stamp(), "\t $id: 1 i=$i counter.count=$(counter.count)")
        println(iobuff, stamp(), "\t\t 2 $id: i=$i counter.count=$(counter.count)")
        counter.count += 1
        println(iobuff, stamp(), "\t\t\t 3 $id: i=$i counter.count=$(counter.count)")
        if counter.count == 15
            println(iobuff, stamp(), "\t $id terminating")
            return
        end
    end
end

# Same tracing counter loop as `docount`: up to ten increments of
# `counter.count`, each surrounded by timestamped trace lines written to
# `iobuff`, ending early once the count reaches exactly 15.
function docount1(counter::Counter, id, iobuff)
    for i in 1:10
        println(iobuff, @sprintf("%f",time()), "\t $id: 1 i=$i counter.count=$(counter.count)")
        println(iobuff, @sprintf("%f",time()), "\t\t 2 $id: i=$i counter.count=$(counter.count)")
        counter.count = counter.count + 1
        println(iobuff, @sprintf("%f",time()), "\t\t\t 3 $id: i=$i counter.count=$(counter.count)")
        # Keep looping until the shared count hits the stop value.
        counter.count == 15 || continue
        println(iobuff, @sprintf("%f",time()), "\t $id terminating")
        return
    end
end

counter = Counter(0)

# One trace buffer per task. These are created explicitly because a variable
# name cannot be built with `$` outside of a quoted expression.
iobuff_a = IOBuffer()
iobuff_b = IOBuffer()


# Run two counting tasks against the shared counter, each under the same lock.
ll = TatasLock()
@sync begin
    @async @with_lock(ll,docount(counter,"a",iobuff_a))
    @async @with_lock(ll,docount(counter,"b",iobuff_b))
end

@show(counter.count)
# Merge both traces; every line starts with its timestamp, so sorting the
# lines orders them by time.
notes_a = split( takebuf_string(iobuff_a),"\n" )
notes_b = split( takebuf_string(iobuff_b),"\n" )
println(join(sort([notes_a; notes_b]),"\n"))