In [1]:
using Iterators
using Pipe
using Compat

macro printval(ee)
    ee_expr = @sprintf "%s" string(ee)
    esc(:(println($ee_expr," = ", $ee)))
end

macro pz(ee)
    ee_expr = @sprintf "%s" string(ee)
    esc(:(println($ee_expr,"\t\t",typeof($ee), "\t", size($ee))))
end

push!(LOAD_PATH, ".")
push!(LOAD_PATH, "../util/")

4-element Array{ByteString,1}:
 "/home/ubuntu/build/julia-master/usr/local/share/julia/site/v0.5"
 "/home/ubuntu/build/julia-master/usr/share/julia/site/v0.5"      
 "."                                                              
 "../util/"                                                       

In [2]:
@everywhere using Distances

In [3]:
const START_MARKER = "**START**"
const END_MARKER = "**END**"

atis_data = open("atis_data.jsz","r") do fh
    deserialize(fh)    
end

@assert START_MARKER in atis_data["indexed_words"]
@assert START_MARKER in keys(atis_data["word_indexes"])
@assert END_MARKER in atis_data["indexed_words"]
@assert END_MARKER in keys(atis_data["word_indexes"])

In [None]:
# τ = Pheromone
# η = Prior
# α = pheremone weight
# β = prior weight
# ρ = pheremone evaporation
function get_edge_probs(τ::AbstractMatrix,η::AbstractMatrix, α, β)
    edges = τ.^α .* η.^β
    edges./=sum(edges,1) #Normalise to probabilities
    @inbounds edges[isnan(edges)]=0.0 #The terminal node will have a sum of zero
    edges
end
    
function get_edge_probs(τ::AbstractSparseMatrix,η::AbstractSparseMatrix, α, β)
    edges = τ.^α .* η.^β
    sums = sum(edges,1)
    
    I,J,V = findnz(edges)
    for idx in 1:length(V)
        @inbounds V[idx] /= sums[J[idx]]
    end
    sparse(I,J,V)
end

In [None]:
@everywhere @inline function select_node{F<:Number}(edge_probs::AbstractVector{F})
    ys, _, probs = findnz(edge_probs'')
    select_node(zip(ys,probs))
end

@everywhere @inline function select_node{A, F<:Number}(edge_probs::Dict{A, F})
    select_node(collect(edge_probs))
end

@everywhere function select_node(edge_probs)
    cutoff = rand()
    total = 0.0
    @assert(length(edge_probs)>0, "No options from point $x")
    for (y,prob) in edge_probs
        total+=prob
        if total>=cutoff
            return y
        end
    end
    
    @assert(total>=cutoff, "x=$x, total= $total, cutoff=$cutoff") #Should Never Reach this point
end


In [None]:
@everywhere function run_ant(edge_probs, start_index, end_index)
    path::Vector{Int} = [start_index] #1 is always the start node. Attached to it are unigram probs
    while (path[end]!=end_index)
        push!(path, select_node(edge_probs[:,path[end]]))
    end
    path    
end



function optimise(fit_fun, η, α, β,ρ, n_ants, n_gens, start_index, end_index)
    assert(α>=1.0)
    assert(β>=1.0)
    assert(0.0<=ρ<1.0)
    
    max_fit::Float64 = -Inf;
    fittest_path::Vector{Int} = []
    τ = ceil(η) #Since we take the element wise product with η, there is no point setting any elements that are zero in η
    
    for gen_ii in 1:n_gens
        if gen_ii%10==1 || gen_ii<10
            print("Gen: $gen_ii - fit: $max_fit\n")
        end
        
        edge_probs = get_edge_probs(τ,η, α, β)
        #edge_probs = SharedArray(Float64, size(τ))
        #edge_probs[:]=get_edge_probs(τ,η, α, β)
        
        
        path_fits::Vector{Tuple{Vector{Int},Float64}} = map(1:n_ants) do ant_id
            path::Vector{Int} = run_ant(edge_probs, start_index, end_index)
            fit::Float64 = fit_fun(path)
            (path, fit)
        end
        
    
        τ.*=(1.0-ρ) #Evaporation
        for (path, fit) in path_fits
            if fit>max_fit
                max_fit = fit
                fittest_path = path
                
                if fit==Inf #Perfect Fit
                    return fittest_path, max_fit
                end
            end

            
            @simd for node_ii in 1:length(path)-1
                @inbounds node = path[node_ii]
                @inbounds next_node = path[node_ii+1]
                @inbounds τ[next_node, node] += fit
            end
        end
    end #Next Generation
    
    fittest_path, max_fit
end

In [None]:
function lookup_sowe(data, sent)
    sum([data["LL"][:,data["word_indexes"][word]] for word in split(sent)]) 
end

In [None]:
function optimise(data::Dict, target_sent::AbstractString, α, β, ρ, n_ants, n_gens)
    target::Vector{Float64} = lookup_sowe(data,target_sent)
    optimise(data, target, α, β,ρ, n_ants, n_gens)
end

@inline function optimise(data::Dict, target::AbstractVector, α, β,ρ, n_ants, n_gens)
    #rLLs = Dict([id=>put!(RemoteRef(), data["LL"]) for id in workers()])
    #LL = SharedArray(Float64, size(atis_data["LL"]))
    LL =atis_data["LL"]
    @inline function fitness(path)
        #Fitter is larger
        @inbounds path_end = sum([LL[:,ii] for ii in path]) 
        if all(path_end.==0.0)
            0.0 #Cosine dist from origin to anywhere is zero.
            #This occurs for forced stopwords
        else
            (1.0/cosine_dist(path_end, target))/length(path)
        end
    end
    
    start_index = data["word_indexes"][START_MARKER]
    end_index = data["word_indexes"][END_MARKER]
    
    
    #return fitness, data["bigrams"], α, β,ρ, n_ants, n_gens, start_index, end_index
    path, score = optimise(fitness, data["bigrams"], α, β,ρ, n_ants, n_gens, start_index, end_index)
    score,[data["indexed_words"][ii] for ii in path]
end

In [26]:
#Optimised version of partitions(xs,2),
subset_indexes_for_len = Dict{Int,Vector{Vector{Bool}}}()
function nonempty_subset_splits{T}(xs::Vector{T})
    K = length(xs)
    ss ::Vector{Vector{Bool}} = if haskey(subset_indexes_for_len,K)
            subset_indexes_for_len[K]
         else 
            subset_indexes_for_len[K] = Vector{Bool}[collect(pr) for (_,pr) in  zip(1:(K^2)/2, product([(true,false) for k in 1:K]...))][2:end]
            subset_indexes_for_len[K]
         end

    Tuple{Vector{T},Vector{T}}[(xs[bb],xs[~bb]) for bb in ss]
end

function nonempty_subset_splits(xs)
    nonempty_subset_splits(collect(xs))
end


nonempty_subset_splits (generic function with 2 methods)

In [None]:
function pairwise(pp)
    Task() do 
        for ii in 2:2:length(pp)
            produce((pp[ii-1],pp[ii]))
        end
        if isodd(length(pp))
            produce(pp[end]) 
        end
    end
end

_partions_of_pairs_indexes=Dict{Vector{Int64},Vector{Vector{Int64}}}()
function partions_of_pairs_indexes{Int64}(xs::Vector{Int64})
    if !haskey(_partions_of_pairs_indexes, xs)
        _partions_of_pairs_indexes[xs] = 
            if length(xs)<=2
                Vector{Int64}[xs]
            else
                rets = Vector{Vector{Int64}}()
                sizehint!(rets, length(xs)^div(length(xs),2))
                for i1 in 1:length(xs-1)
                    for i2 in i1+1:length(xs)
                        head = (xs[[i1,i2]])
                        tails = partions_of_pairs_indexes(xs[[1:i1-1;i1+1:i2-1;i2+1:end]])

                        for tail in tails
                                push!(rets, [head; tail])
                            end
                        end
                end
                rets
            end
    end
    _partions_of_pairs_indexes[xs] 
end

function partions_of_pairs(xs::Vector{Int64})
    partions_of_pairs_indexes(xs)
end


In [28]:
using DataStructures
function DataStructures.counter(T::DataType,seq)
    ct = counter(T)
    for x::T in seq
        push!(ct, x)
    end
    return ct
end

function =={K,V}(lhs::DataStructures.Accumulator{K,V},rhs::DataStructures.Accumulator{K,V})
    lhs.map == rhs.map    
end


function Base.hash{K,V}(obj::DataStructures.Accumulator{K,V},h::UInt64)
    hash(obj.map,h)
end

hash (generic function with 43 methods)

In [29]:
@fastmath function get_prior_fun(uni_occur, bi_joint, prob_length, end_index)
    n_edges=length(uni_occur)
    bi_marginal = sum(bi_joint,1)
    bi_conditional = bi_joint./bi_marginal
  
    
    @inline function joint(aa::Int64)
        @inbounds ret = bi_marginal[aa]
        ret
    end
    
    @inline function joint(aa::Int64,bb::Int64)
        @inbounds ret = bi_joint[aa, bb]
        ret
    end
    
    joint_cache_stats = Dict([("hits",0), ("misses",0)])
    joint_cache = Dict{Accumulator{Int64,Int64}, Float64}() #TODO Should Use a multiset instead to index
    @inline function joint(xxs...) 
        @assert(length(xxs)>=3)
        xxs_bag = counter(Int64, xxs) 
        if !(haskey(joint_cache,xxs_bag))
            joint_cache_stats["misses"]+=1
            #Full calc
            joint_cache[xxs_bag] = mean(Float64[(joint(aas...)*joint(bbs...)) for (aas,bbs) in  nonempty_subset_splits(xxs)])
            
            #Pairwise Calc Only Marginally faster, and has bugs
            #kk = length(xxs)
            #nComponants = (div(kk^2+kk,2)-1)
            #total_prob=1.0
            #for i1 in 1:length(xxs)-1
            #    for i2 in i1:length(xxs)
            #        head = (xxs[i1],xxs[i2])
            #        head_prob = joint(head...)
            #        tail = xxs[[1:i1-1;i1+1:i2-1;i2+1:end]]
            #        tail_prob = joint(tail...)
            #        total_prob+=head_prob*tail_prob
            #    end
            #end
            #joint_cache[xxs_bag] = total_prob/nComponants
        else
            joint_cache_stats["hits"]+=1
        end
        joint_cache[xxs_bag]
    end
    
    conditional_occurance::Function
    @inbounds function conditional_occurance(ws::Vector{Int64})
        if length(ws) == 0
            uni_occur
        elseif length(ws)==1
            bi_conditional[:,ws[1]]
        else
            Float64[joint(aa,ws...) for aa in 1:n_edges]./joint(ws...)
        end
    end
    
    function prior(ws)
        if  length(ws) > length(prob_length) #If it is longer than the tail of length distro, just force temrination
            force_end_pmf = zeros(n_edges) #This PMF forces the sentence to end, if it is returned
            @inbounds force_end_pmf[end_index]=1.0
            force_end_pmf
        else 
            #Either do not end (P=1-P_end) or end (P=P_end)
            #prob_end = prob_length[length(ws)]   #Chance to end at this length
            prob_end=0.0
            prob_occurance::Vector{Float64} = conditional_occurance(ws)
            
            choices_prob = (1.0-prob_end).*prob_occurance
            #@inbounds choices_prob[end_index]=prob_end
            choices_prob
        end 
    end
    prior,joint_cache_stats
end

function get_prior_fun(data::Dict)
    get_prior_fun(data["unioccur"],data["bioccur"], data["length_prob"], data["word_indexes"][END_MARKER])
end

get_prior_fun (generic function with 2 methods)

In [30]:
atis_prior,atis_prior_cache_stats = get_prior_fun(atis_data)
r = @time atis_prior(Int64[1,2,5,4,5,6,7]) #9.335965 seconds
println(atis_prior_cache_stats)
r

 10

1121-element Array{Float64,1}:
 0.00867902 
 0.00430445 
 0.00616229 
 0.0123616  
 0.0151555  
 0.00216173 
 0.000264517
 0.00042264 
 0.0371306  
 0.00286149 
 0.000387054
 0.000674   
 0.00113445 
 ⋮          
 0.00030314 
 0.000311481
 0.000280357
 0.000392338
 0.000260774
 0.0116674  
 0.0101306  
 0.0198284  
 0.00916513 
 0.0370731  
 0.000253558
 0.000253558

In [31]:
 0.00789928 
 0.00391773 
 0.00560866 
 0.011251   
 0.0137939  
 0.00196752 
 0.000240752
 0.00038467 
 0.0337947  
 0.00260441 

0.00260441

.250687 seconds (17.85 M allocations: 959.103 MB, 9.69% gc time)
Dict("hits"=>558752,"misses"=>68256)


In [None]:
nonempty_subset_splits(xxs)

In [None]:
@time optimise(atis_data,"what is the longest flight", 1, 1, 0.3, 10000,20)
#code_warntype(optimise, map(typeof,params))

In [None]:
optimise(atis_data,"what is the shortest flight", 1, 1, 0.1, 10000,20)

In [None]:
optimise(atis_data,"what is the first flight from washington to washington", 1, 1, 0.1, 10000,20)

In [None]:
optimise(atis_data,"what are the ground transport options", 1, 1, 0.1, 1000,20)

In [None]:
optimise(atis_data,"where can i go", 1, 1, 0.1, 1000,20)

In [None]:
atis_data["word_indexes"][START_MARKER]
    

In [None]:
`git push""` |> run

In [None]:
`git commit -m="All joint is in, it is optimised but still slow" -a` |> run