In [1]:
using Iterators
using Pipe
using Compat

# Debugging aid: print the source text of an expression followed by its value,
# e.g. `@printval 1 + 2` prints `1 + 2 = 3`.
macro printval(ee)
    label = string(ee)
    # The whole call is escaped so that `ee` is evaluated in the caller's scope.
    return esc(Expr(:call, :println, label, " = ", ee))
end

# Debugging aid: print the source text of an expression, then the type and the
# size of its value, separated by tabs.
#
# The expression is evaluated exactly once (it used to be spliced into the
# generated code twice, so side effects ran twice and expensive expressions
# were computed twice).
macro pz(ee)
    label = string(ee)
    quote
        # `val` is renamed by macro hygiene, so it cannot clash with caller variables;
        # only the user expression itself is escaped into the caller's scope.
        let val = $(esc(ee))
            println($label, "\t\t", typeof(val), "\t", size(val))
        end
    end
end

# Add the working directory and ../util/ to the module search path so that
# modules stored there can be loaded with `using`/`import`.
push!(LOAD_PATH, ".")
push!(LOAD_PATH, "../util/")



4-element Array{ByteString,1}:
 "/home/ubuntu/build/julia-master/usr/local/share/julia/site/v0.5"
 "/home/ubuntu/build/julia-master/usr/share/julia/site/v0.5"      
 "."                                                              
 "../util/"                                                       

In [2]:
@everywhere using Distances

In [78]:
# Marker tokens; their word indexes are used further down as the start and end
# nodes of every path.
const START_MARKER = "**START**"
const END_MARKER = "**END**"

# Deserialise the data set from disk; the do-block form closes the file handle
# even if deserialisation throws.
atis_data = open("atis_data.jsz","r") do fh
    deserialize(fh)    
end

# Sanity checks: both markers must be present in the word list and in the
# word -> index lookup.
@assert START_MARKER in atis_data["indexed_words"]
@assert START_MARKER in keys(atis_data["word_indexes"])
@assert END_MARKER in atis_data["indexed_words"]
@assert END_MARKER in keys(atis_data["word_indexes"])



In [4]:
# τ = Pheromone
# η = Prior
# α = pheromone weight
# β = prior weight
# ρ = pheromone evaporation
# Turn pheromone (τ) and prior (η) weights into per-column transition probabilities.
#
# Each edge is weighted τ^α * η^β and every column is divided by its own sum.
# Entries that come out as NaN (e.g. 0/0 for a column whose weights sum to zero,
# as for the terminal node) are replaced by 0.0.
function get_edge_probs(τ::AbstractMatrix,η::AbstractMatrix, α, β)
    weighted = (τ.^α) .* (η.^β)
    col_totals = sum(weighted, 1)
    probs = weighted ./ col_totals   # normalise each column to probabilities

    for k in eachindex(probs)
        if isnan(probs[k])
            probs[k] = 0.0
        end
    end
    return probs
end
    
# Sparse counterpart of `get_edge_probs`: weight every stored edge by τ^α * η^β
# and divide it by the total weight of its column.
#
# The result always has the same dimensions as the inputs. (Rebuilding the matrix
# from the triplets alone used to shrink it whenever the trailing rows or columns
# held no stored entries, and failed when there were no stored entries at all.)
# A NaN quotient (0/0) is stored as zero, matching the dense method.
function get_edge_probs(τ::AbstractSparseMatrix,η::AbstractSparseMatrix, α, β)
    edges = τ.^α .* η.^β
    n_rows, n_cols = size(edges)
    I, J, V = findnz(edges)

    # Column totals, accumulated from the same stored entries that get normalised.
    col_totals = zeros(eltype(V), n_cols)
    for idx in eachindex(V)
        col_totals[J[idx]] += V[idx]
    end

    for idx in eachindex(V)
        p = V[idx] / col_totals[J[idx]]
        V[idx] = isnan(p) ? zero(p) : p
    end

    # Pass the dimensions explicitly so empty trailing rows/columns are preserved.
    return sparse(I, J, V, n_rows, n_cols)
end



get_edge_probs (generic function with 2 methods)

In [5]:
?methodswith

search: 

```
methodswith(typ[, module or function][, showparents])
```

Return an array of methods with an argument of type `typ`. If optional `showparents` is `true`, also return arguments with a parent type of `typ`, excluding type `Any`.

The optional second argument restricts the search to a particular module or function.


In [6]:
# Select a node from a vector of probabilities indexed by node.
# Only the entries reported by `findnz` are offered as candidates; the sampling
# itself is delegated to the generic `select_node` method.
@everywhere @inline function select_node{F<:Number}(edge_probs::AbstractVector{F})
    # Transposing twice hands `findnz` a matrix, so it returns (rows, cols, values).
    as_column = edge_probs''
    candidates, _, weights = findnz(as_column)
    return select_node(zip(candidates, weights))
end

# Select a key from a dictionary that maps each candidate to its probability.
# The entries are collected into a vector and handed to the generic method.
@everywhere @inline function select_node{A, F<:Number}(edge_probs::Dict{A, F})
    candidate_pairs = collect(edge_probs)
    return select_node(candidate_pairs)
end

# Sample one candidate from a discrete distribution.
#
# `edge_probs` is an iterable of `(candidate, probability)` pairs. A uniform
# random cutoff is drawn and the first candidate at which the running total of
# the probabilities reaches the cutoff is returned.
#
# Throws an `AssertionError` when there are no candidates, or when the
# probabilities sum to less than the drawn cutoff. (The messages used to
# interpolate an undefined variable `x`, so a failing check raised
# `UndefVarError` instead of reporting the real problem.)
@everywhere function select_node(edge_probs)
    cutoff = rand()
    total = 0.0
    isempty(edge_probs) && throw(AssertionError("No options to select from"))

    for (y, prob) in edge_probs
        total += prob
        if total >= cutoff
            return y
        end
    end

    # Only reachable when the running total never reached the cutoff.
    throw(AssertionError("total= $total, cutoff=$cutoff"))
end


methodswith



In [7]:
# Walk a single ant from `start_index` until it arrives at `end_index`.
#
# Column `j` of `edge_probs` is handed to `select_node` to choose the node that
# follows node `j`. Returns the visited node indexes, start and end included.
@everywhere function run_ant(edge_probs, start_index, end_index)
    path = Int[start_index]
    current = path[end]
    while current != end_index
        outgoing = edge_probs[:, current]
        push!(path, select_node(outgoing))
        current = path[end]
    end
    return path
end



# Ant-colony search for the path from `start_index` to `end_index` that
# maximises `fit_fun`.
#
# Arguments
#   fit_fun      maps a path (vector of node indexes) to a fitness; larger is fitter
#   η            prior edge weights; column `j` holds the weights of the moves out of node `j`
#   α, β         exponents applied to the pheromone and to the prior (each must be >= 1)
#   ρ            fraction of pheromone that evaporates every generation (0 <= ρ < 1)
#   n_ants       number of paths sampled per generation
#   n_gens       number of generations
#   start_index  node every ant starts from
#   end_index    node at which an ant stops
#
# Returns `(fittest_path, max_fit)`. With zero generations this is `(Int[], -Inf)`.
# Returns early as soon as a path with infinite fitness is found.
function optimise(fit_fun, η, α, β,ρ, n_ants, n_gens, start_index, end_index)
    @assert α >= 1.0
    @assert β >= 1.0
    @assert 0.0 <= ρ < 1.0

    max_fit::Float64 = -Inf
    fittest_path::Vector{Int} = Int[]
    # Pheromone starts at ceil(η): edges that are zero in the prior stay zero,
    # because the two are multiplied element-wise anyway.
    τ = ceil(η)

    for gen_ii in 1:n_gens
        if gen_ii%10==1 || gen_ii<10
            print("Gen: $gen_ii - fit: $max_fit\n")
        end

        edge_probs = get_edge_probs(τ,η, α, β)

        # Every ant of this generation samples from the same probabilities.
        path_fits::Vector{Tuple{Vector{Int},Float64}} = map(1:n_ants) do ant_id
            path::Vector{Int} = run_ant(edge_probs, start_index, end_index)
            fit::Float64 = fit_fun(path)
            (path, fit)
        end

        τ.*=(1.0-ρ) # Evaporation
        for (path, fit) in path_fits
            if fit>max_fit
                max_fit = fit
                fittest_path = path

                if fit==Inf # Perfect fit
                    return fittest_path, max_fit
                end
            end

            # Deposit pheromone along the path. This is a plain loop on purpose:
            # a path may traverse the same edge more than once, so the iterations
            # are not independent and must not be marked @simd; the indexes are
            # bounds-checked because nothing here proves they are in range.
            for node_ii in 1:length(path)-1
                node = path[node_ii]
                next_node = path[node_ii+1]
                τ[next_node, node] += fit
            end
        end
    end # Next generation

    return fittest_path, max_fit
end

optimise (generic function with 1 method)

In [8]:
# Sum the columns of data["LL"] selected by the words of `sent`.
# `sent` is split on whitespace and every word is mapped to its column through
# data["word_indexes"].
function lookup_sowe(data, sent)
    columns = [data["LL"][:, data["word_indexes"][word]] for word in split(sent)]
    return sum(columns)
end

lookup_sowe (generic function with 1 method)

In [9]:
# Convenience method: look up the summed-embedding target of `target_sent` and
# run the optimisation against that vector.
function optimise(data::Dict, target_sent::AbstractString, α, β, ρ, n_ants, n_gens)
    target = convert(Vector{Float64}, lookup_sowe(data, target_sent))
    return optimise(data, target, α, β, ρ, n_ants, n_gens)
end

# Search for the word sequence whose summed columns of data["LL"] are closest,
# by cosine distance, to `target`.
#
# Reads data["LL"], data["word_indexes"], data["bigrams"] and
# data["indexed_words"]. Returns `(score, words)`, where `words` is the fittest
# path translated back through data["indexed_words"].
#
# The embeddings are taken from the `data` argument (they used to be read from
# the global `atis_data`, so any other data set was silently ignored).
@inline function optimise(data::Dict, target::AbstractVector, α, β,ρ, n_ants, n_gens)
    LL = data["LL"]

    # Fitness of a path; larger is fitter.
    @inline function fitness(path)
        path_end = sum([LL[:,ii] for ii in path])
        if all(path_end.==0.0)
            # A path that sums to the zero vector has no direction to compare
            # against the target, so it is scored 0.0.
            # This occurs for forced stopwords.
            0.0
        else
            # Inverse cosine distance, divided by the path length.
            (1.0/cosine_dist(path_end, target))/length(path)
        end
    end

    start_index = data["word_indexes"][START_MARKER]
    end_index = data["word_indexes"][END_MARKER]

    path, score = optimise(fitness, data["bigrams"], α, β,ρ, n_ants, n_gens, start_index, end_index)
    return score,[data["indexed_words"][ii] for ii in path]
end

optimise (generic function with 3 methods)

In [99]:



# Build a function `prior(ws)` that returns a weight vector over the next word,
# given the indexes `ws` of the words chosen so far.
#
# Arguments
#   uni_occur    vector returned when no word has been chosen yet
#   bi_joint     square matrix of pairwise weights; it is normalised by its column sums
#   prob_length  prob_length[n] is the chance of forcing the end after n words
#   end_index    index of the end marker
#
# `prior` returns
#   * `uni_occur` for an empty history,
#   * a vector that is 1.0 at `end_index` and 0.0 elsewhere when the history is
#     longer than `prob_length`, or when a random draw falls below
#     `prob_length[length(ws)]`,
#   * a column of the column-normalised `bi_joint` for a one-word history,
#   * an approximate trigram weight divided by `bi_joint[ws[1], ws[2]]` for a
#     two-word history,
#   * `nothing` for longer histories.
#     NOTE(review): longer histories are not handled yet — confirm the intended rule.
#
# The one- and two-word branches used to reference the undefined names
# `words_so_far` and `bi_occur` and therefore always threw.
function get_prior_fun(uni_occur, bi_joint, prob_length, end_index)
    n_edges=length(uni_occur)
    bi_marginal = sum(bi_joint,1)
    bi_conditional = bi_joint./bi_marginal

    force_end_pmf = zeros(n_edges) # This PMF forces the sentence to end, if it is returned
    force_end_pmf[end_index]=1.0

    function approx_tri_joint(aa, bb, cc)
        #See http://stats.stackexchange.com/questions/175153/approximating-pa-b-c-using-pa-b-pa-c-pb-c-and-pa-pb-pc/
        (bi_marginal[aa]*bi_joint[bb,cc]
         + bi_marginal[bb]*bi_joint[aa,cc]
         + bi_marginal[cc]*bi_joint[aa,bb]) / 3.0
    end

    function prior(ws)
        n_words = length(ws)
        if n_words == 0
            return uni_occur
        elseif (n_words > length(prob_length)      # longer than the tail of the length distribution
                || rand() < prob_length[n_words])  # chance to end at this length
            return force_end_pmf
        elseif n_words == 1
            return bi_conditional[:, ws[1]]
        elseif n_words == 2
            tri = Float64[approx_tri_joint(aa, ws[1], ws[2]) for aa in 1:n_edges]
            return tri ./ bi_joint[ws[1], ws[2]]
        else
            return nothing
        end
    end

    return prior
end

get_prior_fun (generic function with 1 method)

In [98]:
bm = sum(bo,1)

1x1121 Array{Float64,2}:
 0.00754159  0.00478645  0.00462697  …  0.028573  0.000416747  0.000416747

In [97]:
# NOTE(review): unfinished scratch cell — the division below has no right-hand
# operand, and `approx_trioccur` is not defined anywhere in the visible notebook.
for aa in 1:length(bm)
    approx_trioccur(aa)/
end

2-element Array{Int64,1}:
 0
 0

In [60]:
bo[:,4]

1121-element Array{Float64,1}:
 9.7402e-5  
 8.77362e-5 
 0.000107811
 2.41646e-5 
 0.000274361
 1.56141e-5 
 1.11529e-6 
 7.43527e-7 
 0.000631254
 4.60987e-5 
 1.11529e-6 
 2.23058e-6 
 1.93317e-5 
 ⋮          
 3.71763e-7 
 7.43527e-7 
 7.43527e-7 
 3.71763e-7 
 3.71763e-7 
 0.000108183
 9.18256e-5 
 0.00024871 
 0.000199265
 0.000357636
 3.71763e-7 
 3.71763e-7 

1121x2 Array{Float64,2}:
 0.00754159   0.012318   
 0.00478645   0.00821975 
 0.00462697   0.00765314 
 0.00960562   0.0191794  
 0.0159855    0.0266696  
 0.00200157   0.00254587 
 0.000434591  2.32854e-5 
 0.000476601  9.31417e-5 
 0.0223151    0.0416654  
 0.0026845    0.00326772 
 0.000540916  0.000225092
 0.000772153  0.000426899
 0.00105432   0.00122637 
 ⋮                       
 0.000433104  3.10472e-5 
 0.000445373  5.43326e-5 
 0.000437194  4.65708e-5 
 0.000494445  0.000162998
 0.00042158   7.76181e-6 
 0.00848364   0.0139635  
 0.00793492   0.0113167  
 0.0118745    0.0192338  
 0.00688022   0.0120929  
 0.028573     0.0505682  
 0.000416747  0.0        
 0.000416747  0.0        

In [73]:
sum(bo[:,5])

0.0159854536424075

In [74]:
sum(bo,2)

1121x1 Array{Float64,2}:
 0.00754159 
 0.00478645 
 0.00462697 
 0.00960562 
 0.0159855  
 0.00200157 
 0.000434591
 0.000476601
 0.0223151  
 0.0026845  
 0.000540916
 0.000772153
 0.00105432 
 ⋮          
 0.000433104
 0.000445373
 0.000437194
 0.000494445
 0.00042158 
 0.00848364 
 0.00793492 
 0.0118745  
 0.00688022 
 0.028573   
 0.000416747
 0.000416747

1121x1121 Array{Float64,2}:
 0.00813369   0.00613592   0.0188012    …  0.000892061  0.000892061
 0.00389431   0.00365049   0.00522256      0.000892061  0.000892061
 0.011535     0.00504854   0.00393701      0.000892061  0.000892061
 0.0129153    0.0183301    0.0233007       0.000892061  0.000892061
 0.016366     0.0120388    0.0204082       0.000892061  0.000892061
 0.00650695   0.000776699  0.000482083  …  0.000892061  0.000892061
 4.92951e-5   7.76699e-5   0.000241041     0.000892061  0.000892061
 0.00019718   0.00023301   0.000482083     0.000892061  0.000892061
 0.0483092    0.0594175    0.0414591       0.000892061  0.000892061
 0.00478162   0.00240777   0.00707054      0.000892061  0.000892061
 0.000394361  0.00015534   0.000803471  …  0.000892061  0.000892061
 0.00147885   0.000466019  0.000321388     0.000892061  0.000892061
 0.00113379   0.00108738   0.000642777     0.000892061  0.000892061
 ⋮                                      ⋱               ⋮          
 0.000147885  0.0003

In [89]:
bo[:,3]/sum(bo[:,3])


1121-element Array{Float64,1}:
 0.0188012  
 0.00522256 
 0.00393701 
 0.0233007  
 0.0204082  
 0.000482083
 0.000241041
 0.000482083
 0.0414591  
 0.00707054 
 0.000803471
 0.000321388
 0.000642777
 ⋮          
 8.03471e-5 
 0.000160694
 8.03471e-5 
 8.03471e-5 
 8.03471e-5 
 0.0123735  
 0.0127752  
 0.0259521  
 0.0117307  
 0.0356741  
 8.03471e-5 
 8.03471e-5 

In [50]:
# Scratch check: divide column 5 of `bo` by entry 5 of `uo`, zero out infinite
# quotients, and look at the total.
oo = bo[:,5]./uo[5]
oo[isinf(oo)]=0.0 #remove anything that had a unioccur of zero
sum(oo)

0.5993893787756731

In [79]:
# Short aliases for the "bioccur" and "unioccur" entries of the loaded data,
# used by the exploratory cells.
bo=atis_data["bioccur"]
uo=atis_data["unioccur"]

1121-element Array{Float64,1}:
 0.012318   
 0.00821975 
 0.00765314 
 0.0191794  
 0.0266696  
 0.00254587 
 2.32854e-5 
 9.31417e-5 
 0.0416654  
 0.00326772 
 0.000225092
 0.000426899
 0.00122637 
 ⋮          
 3.10472e-5 
 5.43326e-5 
 4.65708e-5 
 0.000162998
 7.76181e-6 
 0.0139635  
 0.0113167  
 0.0192338  
 0.0120929  
 0.0505682  
 0.0        
 0.0        

In [14]:
sum(bo[6,:])

0.0020015740461573972

In [15]:
bo[:,6]./uo

1121-element Array{Any,1}:
   0.00398383 
   0.000452281
   0.000291459
   0.000814105
   0.00111517 
   0.00073013 
   0.0159655  
   0.00399138 
   0.0011064  
   0.000796379
   0.00330321 
   0.00261254 
   0.00121257 
   ⋮          
   0.0119741  
   0.00684236 
   0.00798275 
   0.00228079 
   0.0478965  
   0.0025559  
   0.00164254 
   0.00411701 
   0.0007993  
   0.00185264 
 Inf          
 Inf          

In [16]:
# Timed run with α=1, β=1, ρ=0.3, 10000 ants per generation and 20 generations.
@time optimise(atis_data,"what is the longest flight", 1, 1, 0.3, 10000,20)
#code_warntype(optimise, map(typeof,params))

Gen: 1 - fit: -Inf
Gen: 2 - fit: 2.841247204959871
Gen: 3 - fit: 2.841247204959871
Gen: 4 - fit: 2.841247204959871
Gen: 5 - fit: 2.841247204959871
Gen: 6 - fit: 2.841247204959871
Gen: 7 - fit: 2.841247204959871
Gen: 8 - fit: 2.841247204959871
Gen: 9 - fit: 2.841247204959871
Gen: 11 - fit: 2.841247204959871
 68

(2.841247204959871,Any["**START**","what","is","the","flight","**END**"])

In [17]:
optimise(atis_data,"what is the shortest flight", 1, 1, 0.1, 10000,20)

.628699 seconds (34.20 M allocations: 87.897 GB, 16.47% gc time)
Gen: 1 - fit: -Inf
Gen: 2 - fit: 2.9412664090410554
Gen: 3 - fit: 2.9412664090410554
Gen: 4 - fit: 2.9412664090410554
Gen: 5 - fit: 2.9412664090410554
Gen: 6 - fit: 2.9412664090410554
Gen: 7 - fit: 2.9412664090410554
Gen: 8 - fit: 2.9412664090410554
Gen: 9 - fit: 2.9412664090410554
Gen: 11 - fit: 2.9412664090410554


(2.9412664090410554,Any["**START**","what","is","the","flight","**END**"])

In [18]:
optimise(atis_data,"what is the first flight from washington to washington", 1, 1, 0.1, 10000,20)

Gen: 1 - fit: -Inf
Gen: 2 - fit: 1.5393241009513545
Gen: 3 - fit: 1.9561430275898823
Gen: 4 - fit: 1.9561430275898823
Gen: 5 - fit: 1.9561430275898823
Gen: 6 - fit: 1.9643387905327887
Gen: 7 - fit: 1.9643387905327887
Gen: 8 - fit: 1.9643387905327887
Gen: 9 - fit: 1.9643387905327887
Gen: 11 - fit: 1.9643387905327887


(1.9643387905327887,Any["**START**","what","is","the","flight","from","boston","to","washington","**END**"])

In [19]:
optimise(atis_data,"what are the ground transport options", 1, 1, 0.1, 1000,20)

Gen: 1 - fit: -Inf
Gen: 2 - fit: 0.9565541331530678
Gen: 3 - fit: 0.9565541331530678
Gen: 4 - fit: 0.9565541331530678
Gen: 5 - fit: 0.9565541331530678
Gen: 6 - fit: 0.9565541331530678
Gen: 7 - fit: 0.9565541331530678
Gen: 8 - fit: 0.9565541331530678
Gen: 9 - fit: 0.9565541331530678
Gen: 11 - fit: 0.9565541331530678


(0.9565541331530678,Any["**START**","what","ground","transportation","**END**"])

In [20]:
optimise(atis_data,"where can i go", 1, 1, 0.1, 1000,20)

Gen: 1 - fit: -Inf
Gen: 2 - fit: 0.9867084973755195
Gen: 3 - fit: 0.9867084973755195
Gen: 4 - fit: 0.9867084973755195
Gen: 5 - fit: 0.9867084973755195
Gen: 6 - fit: 0.9867084973755195
Gen: 7 - fit: 0.9867084973755195
Gen: 8 - fit: 0.9867084973755195
Gen: 9 - fit: 0.9867084973755195
Gen: 11 - fit: 0.9867084973755195


(0.9867084973755195,Any["**START**","can","you","**END**"])

In [21]:
atis_data["word_indexes"][START_MARKER]
    

1120

In [76]:
`git push""` |> run

Git 2.0 from 'matching' to 'simple'. To squelch this message
and maintain the current behavior after the default changes, use:

  git config --global push.default matching

To squelch this message and adopt the new behavior now, use:

  git config --global push.default simple

When push.default is set to 'matching', git will push local branches
to the remote branches that already exist with the same name.

In Git 2.0, Git will default to the more conservative 'simple'
behavior, which only pushes the current branch to the corresponding
remote branch that 'git pull' uses to update the current branch.

See 'git help config' and search for 'push.default' for further information.
(the 'simple' mode was introduced in Git 1.7.11. Use the similar mode
'current' instead of 'simple' if you sometimes use older versions of Git)

To git@github.com:oxinabox/phd.git
   c4a90bd..da9fd80  master -> master


In [75]:
`git commit -m="Have not quet worked out my occurance statistics but am getting there" -a` |> run

[master da9fd80] =Have not quet worked out my occurance statistics but am getting there
 Committer: = <ubuntu@danu.rc.pawsey.org.au>
Your name and email address were configured automatically based
on your username and hostname. Please check that they are accurate.
You can suppress this message by setting them explicitly:

    git config --global user.name "Your Name"
    git config --global user.email you@example.com

After doing this, you may fix the identity used for this commit with:

    git commit --amend --reset-author

 2 files changed, 5063 insertions(+), 1583 deletions(-)
 rewrite prototypes/SumOfWordEmbeddings/AntSubsetSub.ipynb (83%)
