In [5]:
using Iterators
using Pipe
using Compat

# Print an expression's source text followed by its value, e.g. `x + 1 = 3`.
# The whole `println` call is escaped, so `ee` is evaluated in the caller's scope.
macro printval(ee)
    label = string(ee)
    call = :(println($label, " = ", $ee))
    esc(call)
end

# Print an expression's source text, then the type and size of its value,
# separated by tabs. The call is escaped so `ee` resolves in the caller's scope.
macro pz(ee)
    label = string(ee)
    call = :(println($label, "\t\t", typeof($ee), "\t", size($ee)))
    esc(call)
end

# Append the working directory and the sibling util directory to the module
# search path.
push!(LOAD_PATH, ".")
push!(LOAD_PATH, "../util/")

4-element Array{ByteString,1}:
 "/home/ubuntu/build/julia-master/usr/local/share/julia/site/v0.5"
 "/home/ubuntu/build/julia-master/usr/share/julia/site/v0.5"      
 "."                                                              
 "../util/"                                                       

In [6]:
@everywhere using Distances

In [7]:
# Sentinel tokens that must be present in the loaded vocabulary.
const START_MARKER = "**START**"
const END_MARKER = "**END**"

# Read the serialized data set; `open` closes the file once `deserialize` returns.
atis_data = open(deserialize, "atis_data.jsz", "r")

# Both sentinels must appear in the index->word list and the word->index map.
@assert START_MARKER in atis_data["indexed_words"]
@assert START_MARKER in keys(atis_data["word_indexes"])
@assert END_MARKER in atis_data["indexed_words"]
@assert END_MARKER in keys(atis_data["word_indexes"])

In [8]:
# τ = pheromone
# η = prior
# α = pheromone weight
# β = prior weight
# ρ = pheromone evaporation rate

# Sample an index from a discrete probability distribution.
#
# `edge_probs[ii]` is the probability of index `ii`; the values are expected to
# sum to (approximately) one. A uniform random cutoff is drawn and the first
# index at which the running total reaches the cutoff is returned.
#
# A normalised floating point distribution can sum to marginally less than one,
# so a cutoff drawn very close to one may never be reached inside the loop. In
# that case the last index is returned, which keeps the return value an index on
# every path. A shortfall larger than rounding error still trips the assertion.
@everywhere @inline function select_index(edge_probs::AbstractVector)
    @assert(length(edge_probs)>0, "No probs given")
    cutoff = rand()
    total = 0.0
    for (ii,prob) in enumerate(edge_probs)
        total += prob
        if total >= cutoff
            return ii
        end
    end

    # Only reachable through accumulated rounding error.
    @assert(total + 1e-8 >= cutoff, "total= $total, cutoff=$cutoff, $edge_probs")
    return length(edge_probs)
end


# Evaporate pheromone in place: every entry of `τ` is multiplied by `1 - ρ`.
# Returns `τ`.
#
# An explicit loop is used instead of `τ .*= (1.0-ρ)`: on Julia versions where
# `.*=` is sugar for `τ = τ .* x`, that form only rebinds the local variable and
# leaves the caller's array untouched, so a caller that ignores the return value
# sees no evaporation at all.
@everywhere function evaporate!(τ::AbstractArray, ρ)
    retain = 1.0 - ρ
    for ii in eachindex(τ)
        @inbounds τ[ii] *= retain
    end
    τ
end

    

# Deposit pheromone for one ant's path and return `τ`.
#
# The path's fitness is split evenly over its entries and each visited node
# receives one share per visit. Visiting order and co-occurrence of nodes play
# no part here; co-occurrence is left to the prior, which has to account for all
# answers rather than just this one.
@everywhere function deposit!(τ::AbstractVector, path, fit)
    share = fit./length(path)
    for node in path
        @inbounds τ[node] += share
    end
    τ
end

# Build the node-selection distribution for one generation.
#
#   τ  pheromone level per node
#   η  function mapping the path so far to a prior weight per node
#   α  exponent applied to the pheromone
#   β  exponent applied to the prior
#
# Returns a closure `state -> probs` where `probs` is `τ.^α .* η(state).^β`
# normalised to sum to one. The pheromone term is computed once up front because
# it does not depend on the state.
@everywhere function get_edge_probs(τ,η::Function, α, β)
    τ_scaled = (τ).^α
    function get_prob(state)
        η_scaled = (η(state)).^β
        # Combine pheromone with the prior (previously the pheromone was
        # multiplied by itself and the prior was silently dropped).
        tot = τ_scaled .* η_scaled
        tot./sum(tot)
    end
    get_prob
end

# Choose the next node: evaluate the distribution closure on the path so far
# and sample an index from the result.
@everywhere @inline function select_node(edge_probs::Function, path=[])
    probs = edge_probs(path)
    select_index(probs)
end


In [9]:
# Scratch check of `sort!` with a `by` key; the identity key gives ascending order.
sort!([-1,-2,32,23,42], by=pf->pf)

5-element Array{Int64,1}:
 -2
 -1
 23
 32
 42

In [10]:
# Walk a single ant: sample a path length from `end_prob_dist`, then pick that
# many nodes one at a time, each choice conditioned on the nodes chosen so far.
# Returns the path as a vector of node indices.
@everywhere function run_ant(edge_probs, end_prob_dist)
    n_steps = select_index(end_prob_dist)
    visited = Int[]
    while length(visited) < n_steps
        next_node = select_node(edge_probs, visited)
        push!(visited, next_node)
    end
    visited
end


# Ant-colony search over paths of node indices.
#
# Arguments
#   fit_fun        maps a path (Vector{Int}) to a Float64 fitness; larger is fitter
#   τ              pheromone level per node, handed to evaporate!/deposit! each generation
#   η              prior function, passed through to get_edge_probs
#   end_prob_dist  distribution over path lengths, sampled once per ant
# Keywords
#   α, β      pheromone / prior exponents, both >= 1
#   ρ         evaporation rate, 0 <= ρ < 1
#   n_ants    ants per generation, >= 1
#   n_gens    number of generations
#   n_elite   at most this many of the fittest ants are considered and deposit
#             pheromone each generation (default 100, the former fixed value)
#   callback  if it is a Function, it is called after every generation as
#             callback(gen_ii, τ, fittest_path, max_fit, path_fits); note that
#             path_fits has been partially reordered by the selection step
#
# Returns (fittest_path, max_fit, τ). The same 3-tuple is returned when the
# search stops early on an infinite fitness, so callers can always destructure
# three values.
function optimise(fit_fun, τ, η, end_prob_dist;  α=1, β=1,ρ=0, n_ants=100, n_gens=10, n_elite=100, callback=Union{})
    @assert(α>=1.0, "α must be >= 1")
    @assert(β>=1.0, "β must be >= 1")
    @assert(0.0<=ρ<1.0, "ρ must satisfy 0 <= ρ < 1")
    @assert(n_ants>=1, "n_ants must be >= 1")
    @assert(n_elite>=1, "n_elite must be >= 1")

    max_fit::Float64 = -Inf
    fittest_path::Vector{Int} = Int[]

    for gen_ii in 1:n_gens

        edge_probs = get_edge_probs(τ,η, α, β)

        path_fits::Vector{Tuple{Vector{Int},Float64}} = map(1:n_ants) do ant_id
            path::Vector{Int} = run_ant(edge_probs, end_prob_dist)
            fit::Float64 = fit_fun(path)
            (path, fit)
        end

        # Never ask for more elite ants than exist; a fixed 1:100 range is out of
        # bounds whenever n_ants < 100.
        n_top = min(n_elite, length(path_fits))
        ranked_path_fits = select!(path_fits, 1:n_top, by=pf->-pf[2])

        evaporate!(τ,ρ)
        for (path, fit) in ranked_path_fits
            if fit>max_fit
                max_fit = fit
                fittest_path = path

                if fit==Inf #Perfect Fit
                    return fittest_path, max_fit, τ
                end
            end
            deposit!(τ,path, fit)
        end

        if isa(callback, Function)
            callback(gen_ii, τ,fittest_path,max_fit,path_fits)
        end

    end #Next Generation

    fittest_path, max_fit, τ
end


# Convenience method: start from a random pheromone level for each of the
# `n_dims` nodes and hand over to the main `optimise` method, forwarding every
# keyword argument unchanged.
function optimise(fit_fun, n_dims::Int64, η::Function, end_prob_dist; kwargs...)
    initial_τ = [rand() for _ in 1:n_dims]
    optimise(fit_fun, initial_τ, η, end_prob_dist; kwargs...)
end


optimise (generic function with 2 methods)

In [11]:
# Build a prior over the next node from occurrence statistics.
#
#   uni_occur  per-node weight used when nothing has been chosen yet
#   bi_occur   matrix whose column `w` holds the per-node weights given node `w`
#
# Returns `prior(ws)`, where `ws` is the path so far:
#   * empty path  -> `uni_occur` itself (not a copy)
#   * one node    -> column `ws[1]` of `bi_occur`
#   * otherwise   -> `uni_occur` plus the columns of every node in `ws`, divided
#                    by `length(ws) + 1` (a plain average, chosen because it is cheap)
@fastmath function get_prior_fun(uni_occur,bi_occur)
    function prior(ws)
        n_seen = length(ws)
        if n_seen==0
            return uni_occur
        elseif n_seen==1
            return bi_occur[:,ws[1]]
        end
        pooled = uni_occur + sum([bi_occur[:,w] for w in ws])
        return pooled/(n_seen+1)
    end
    prior
end




get_prior_fun (generic function with 1 method)

In [12]:
# Sum the embedding columns of every whitespace-separated word in `sent`.
# `data["word_indexes"]` maps a word to its column in `data["LL"]`.
function lookup_sowe(data, sent)
    embeddings = data["LL"]
    word_indexes = data["word_indexes"]
    columns = [embeddings[:, word_indexes[word]] for word in split(sent)]
    sum(columns)
end

# Translate a path of node indices into the corresponding entries of
# `data["indexed_words"]`, keeping order and repeats.
function lookup_words(data, path)
    vocabulary = data["indexed_words"]
    [vocabulary[idx] for idx in path]
end

lookup_words (generic function with 1 method)

In [13]:
# Run the ant-colony search to find words whose summed embeddings approach those
# of `target_sent`.
#
#   data         dictionary providing "LL", "word_indexes", "indexed_words",
#                "length_prob", "unioccur" and "bioccur"
#   target_sent  sentence whose summed word embeddings form the target
#   log          when true, print progress after every generation and record a
#                copy of the pheromone vector each time (default false; the old
#                default named an undefined variable and failed when omitted)
#   kwargs...    forwarded to `optimise`
#
# Returns (score, words, τlog): the best fitness, the words of the best path, and
# the recorded pheromone vectors with the final one appended.
function built_optimise(data::Dict, target_sent::AbstractString, log=false; kwargs...)
    target::Vector{Float64} = lookup_sowe(data,target_sent)
    LL = data["LL"]

    @inline function fitness(path)
        #Fitter is larger: negated distance between the path's summed embeddings and the target
        @inbounds path_end = sum([LL[:,ii] for ii in path])
        -1.0*euclidean(path_end, target)
    end

    τlog = []
    # Per-generation reporter; kept under its own name so `log` stays a Bool.
    function report(gen_ii, τ, fittest_path, max_fit, path_fits)
        print("oGen: $gen_ii - fit: $max_fit ")
        print(lookup_words(data, fittest_path))
        println()
        push!(τlog,copy(τ))
    end
    # `optimise` ignores any callback that is not a Function, so `false` disables it.
    callback = log ? report : false

    n_dims = size(LL,2)
    length_dist = data["length_prob"]
    prior = get_prior_fun(data["unioccur"], data["bioccur"])

    path, score, τ = optimise(fitness,n_dims, prior, length_dist; callback=callback, kwargs...)
    push!(τlog,τ)
    score,lookup_words(data, path), τlog
end

built_optimise (generic function with 2 methods)

In [None]:
# Timed ant-colony run for one sentence with per-generation logging switched on;
# the cell then displays the recovered words.
@time score, words, τlog = built_optimise(atis_data,"show flights from washington to san francisco", true;
α=1, β=1, ρ=0.2, n_ants=10000, n_gens=10);
words

In [29]:
# Enumerate every true/false mask of length `K`, one mask per subset of `K`
# items. Masks come out in the iteration order of `product`.
function subset_bool_indexes(K::Int)
    choices = [[true,false] for _ in 1:K]
    Vector{Bool}[Bool[flags...] for flags in product(choices...)]
end

# Same enumeration, sized by the length of `vector`.
subset_bool_indexes(vector) = subset_bool_indexes(length(vector))

# Sentence front end: turn `target_sent` into its summed word embedding and run
# the vector-target search with the same initial words.
function greedy_local_search(data::Dict, target_sent::AbstractString, initial_words=AbstractString[])
    sowe_target::Vector{Float64} = lookup_sowe(data, target_sent)
    greedy_local_search(data, sowe_target, initial_words)
end

# Greedy search for a set of words whose summed embeddings approach `target`.
#
#   data           dictionary providing "LL" (one embedding column per word),
#                  "word_indexes" (word -> column) and "indexed_words"
#   target         vector to approximate
#   initial_words  optional starting words; when given, subsets of them are
#                  examined first
#
# Every improvement is printed as it is found. Returns (best_score, words), where
# the score is the negated Euclidean distance to `target`.
function greedy_local_search(data::Dict, target::Vector{Float64}, initial_words=AbstractString[]::Vector{AbstractString})
    
    LL =data["LL"]
    # NOTE(review): indexed_words and n_dims are never read below.
    indexed_words=data["indexed_words"]
    n_dims = size(LL,2)

    # Sum of the embedding columns for `ws`; a zero vector shaped like one column
    # when `ws` is empty.
    @inline function get_end(ws::Vector{Int64})
        @inbounds sofar = length(ws)>0 ? sum([LL[:,ii] for ii in ws]) : zeros(LL[:,1])
        sofar
    end
    
    @inline function fitness(path::Vector{Int64})
        #Fitter is larger (closer to zero): this is the negated distance to the target
        -euclidean(get_end(path), target)   
    end
    
    
    # Score of `ws` extended by each possible word in turn: the current sum is
    # added to every column of LL and the negated distance to the target is taken
    # per column.
    @fastmath function score_possible_additions(ws)
        @inbounds path_ends = LL.+get_end(ws)
        -sqrt(vec(sum((path_ends .- target).^2,1)))
    end   
    
    initial_word_set = Int[data["word_indexes"][word] for word in initial_words]
    best_word_set = initial_word_set
    best_score = fitness(best_word_set)
    did_improve=true
    
    # Phase 1 (only with initial words): try the subsets of the initial words, and
    # for each subset also the single best word that could be added to it.
    if length(initial_words)>0
        did_improve=false
        # The final mask is skipped -- presumably the all-false (empty) subset;
        # this depends on the ordering produced by subset_bool_indexes.
        for subset_inds in subset_bool_indexes(best_word_set)[1:end-1]
            word_set = initial_word_set[subset_inds]
            if fitness(word_set)>best_score
                best_score=fitness(word_set)
                best_word_set = word_set
                println("*$best_score $(lookup_words(data, best_word_set))")
                did_improve=true
            end

            addition_scores = score_possible_additions(word_set)
            addition_score, addition = findmax(addition_scores)
            if addition_score>best_score
                best_score=addition_score
                best_word_set = Int[word_set...,addition]
                println("!$best_score $(lookup_words(data, best_word_set))")
                did_improve=true
            end
        end
    end
    
    
    # Phase 2: keep appending the single best word while that strictly improves
    # the score. With initial words this only runs if phase 1 found an improvement.
    while (did_improve)
        did_improve=false
        addition_scores = score_possible_additions(best_word_set)
        addition_score, addition = findmax(addition_scores)
        if addition_score>best_score
            best_score=addition_score
            best_word_set = [best_word_set...,addition]
            println("+$best_score $(lookup_words(data, best_word_set))")
            did_improve=true
        end
    end
    
    best_score,lookup_words(data, best_word_set)
end


greedy_local_search (generic function with 5 methods)

In [30]:
# Greedy search from an empty word list towards one sentence's summed embedding.
greedy_local_search(atis_data, "show flights from washington to san francisco", AbstractString[])

+-2.5246724924254855 Any["francisco"]


(-0.0,Any["francisco","washington","from","flights","san","show"])

+-2.115438537963471 Any["francisco","washington"]
+-1.7577083221196999 Any["francisco","washington","from"]
+-1.3735055232576556 Any["francisco","washington","from","flights"]
+-0.9999999692848569 Any["francisco","washington","from","flights","san"]
+-0.0 Any["francisco","washington","from","flights","san","show"]


In [25]:
# Same search on a shorter sentence, relying on the default (empty) initial words.
greedy_local_search(atis_data, "show cheapest flights")

+-1.4272728232394019 Any["flights"]


(-0.0,Any["flights","cheapest","show"])

+-0.9999999692848569 Any["flights","cheapest"]
+-0.0 Any["flights","cheapest","show"]


In [26]:
# Greedy search for a longer sentence, again starting from no words.
greedy_local_search(atis_data, "show cheapest flight from washington to san francisco", AbstractString[])

+-2.7960536435600543 Any["francisco"]


(-0.0,Any["francisco","washington","flight","from","cheapest","san","show"])

+-2.4235171879554462 Any["francisco","washington"]
+-2.0417987148573475 Any["francisco","washington","flight"]
+-1.6977284510791408 Any["francisco","washington","flight","from"]
+-1.3735055232576556 Any["francisco","washington","flight","from","cheapest"]
+-0.9999999692848569 Any["francisco","washington","flight","from","cheapest","san"]
+-0.0 Any["francisco","washington","flight","from","cheapest","san","show"]


In [28]:
# Greedy search for a sentence with two modifiers, starting from no words.
greedy_local_search(atis_data, "show cheapest and shortest flights from washington to san francisco", AbstractString[])

+-3.170136926890734 Any["francisco"]
+-2.7940736483358637 Any["francisco","cheapest"]
+-2.4445759168104977 Any["francisco","cheapest","washington"]
+-2.098351878037111 Any["francisco","cheapest","washington","from"]
+-1.7267522992131603 Any["francisco","cheapest","washington","from","flights"]
+-1.3735055232576556 Any["francisco","cheapest","washington","from","flights","shortest"]
+-0.9999999692848569 Any["francisco","cheapest","washington","from","flights","shortest","san"]
+-0.0 Any["francisco","cheapest","washington","from","flights","shortest","san","show"]


(-0.0,Any["francisco","cheapest","washington","from","flights","shortest","san","show"])

In [53]:
# Build a target that blends two sentences' summed embeddings, then search for
# words approximating it.
target = lookup_sowe(atis_data,"show cheapest flights from washington to san francisco")
target += lookup_sowe(atis_data,"show shortest flights from washington to san francisco")
# NOTE(review): the sum of two targets is divided by 1.6, not 2 -- presumably
# deliberate so the target is not an exact mean; confirm.
target./=1.6
greedy_local_search(atis_data,target)

+-3.5417787531885234 Any["francisco"]


(-0.9238781650806026,Any["francisco","washington","flights","from","san","show","cheapest","shortest"])

+-3.110976266049679 Any["francisco","washington"]
+-2.7018626517592645 Any["francisco","washington","flights"]
+-2.2880189617506828 Any["francisco","washington","flights","from"]
+-1.8743271387257674 Any["francisco","washington","flights","from","san"]
+-1.3589086657868763 Any["francisco","washington","flights","from","san","show"]
+-0.9844346359927165 Any["francisco","washington","flights","from","san","show","cheapest"]
+-0.9238781650806026 Any["francisco","washington","flights","from","san","show","cheapest","shortest"]


In [55]:
# Push the current branch; `run` throws if git exits with a non-zero status.
run(`git push`)

To git@github.com:oxinabox/phd.git
 ! [rejected]        master -> master (fetch first)


LoadError: LoadError: failed process: Process(`git push`, ProcessExited(1)) [1]
while loading In[55], in expression starting on line 1

error: failed to push some refs to 'git@github.com:oxinabox/phd.git'
hint: Updates were rejected because the remote contains work that you do
hint: not have locally. This is usually caused by another repository pushing
hint: to the same ref. You may want to first integrate the remote changes
hint: (e.g., 'git pull ...') before pushing again.
hint: See the 'Note about fast-forwards' in 'git push --help' for details.


In [None]:
using UnicodePlots
using IJulia

# For every recorded pheromone vector, show a bar plot of the (at most) 20 words
# carrying the most pheromone.
for pheromone in τlog
    weighted_words = collect(zip(pheromone, atis_data["indexed_words"]))

    n_shown = min(20, length(weighted_words))
    strongest = select!(weighted_words, 1:n_shown, by=pair->-pair[1])
    weights, labels = collect(zip(strongest...))
    IJulia.display(barplot(collect(labels), collect(weights)))
end


In [None]:
using DataStructures
using DataStructuresExtended


# Build a prior over the next node from unigram and bigram statistics.
#
#   uni_occur    per-node weight returned when the path is empty
#   bi_joint     matrix of pairwise weights, indexed [aa, bb]
#   prob_length  path-length distribution; only its length is read here
#   end_index    node that receives all the mass once the path is longer than
#                `prob_length`
#
# Returns (prior, joint_cache_stats): the prior function and a Dict counting the
# hits and misses of the cache used for joints of three or more nodes.
@fastmath function get_prior_fun(uni_occur, bi_joint, prob_length, end_index)
    n_edges=length(uni_occur)
    # Column sums of bi_joint; each column is then divided by its own sum.
    bi_marginal = sum(bi_joint,1)
    bi_conditional = bi_joint./bi_marginal
  
    
    # One index: read the column sum.
    @inline function joint(aa::Int64)
        @inbounds ret = bi_marginal[aa]
        ret
    end
    
    # Two indices: read the pairwise table directly.
    @inline function joint(aa::Int64,bb::Int64)
        @inbounds ret = bi_joint[aa, bb]
        ret
    end
    
    joint_cache_stats = Dict([("hits",0), ("misses",0)])
    joint_cache = Dict{Accumulator{Int64,Int64}, Float64}() #TODO Should Use a multiset instead to index
    # Three or more indices: the mean, over the splits yielded by
    # nonempty_subset_splits, of the product of the two parts' joints. Results are
    # cached under a `counter` of the indices.
    # NOTE(review): nonempty_subset_splits is defined outside this file --
    # presumably every split of xxs into two non-empty parts; confirm.
    @inline function joint(xxs...) 
        @assert(length(xxs)>=3)
        xxs_bag = counter(Int64, xxs) 
        if !(haskey(joint_cache,xxs_bag))
            joint_cache_stats["misses"]+=1
            #Full calc
            joint_cache[xxs_bag] = mean(Float64[(joint(aas...)*joint(bbs...)) for (aas,bbs) in  nonempty_subset_splits(xxs)])
            
            
        else
            joint_cache_stats["hits"]+=1
        end
        joint_cache[xxs_bag]
    end
    
    # Per-node weight given the path so far: uni_occur for an empty path, a column
    # of bi_conditional for a single node, otherwise joint(aa, ws...) for each
    # node aa divided by joint(ws...).
    conditional_occurance::Function
    @inbounds function conditional_occurance(ws::Vector{Int64})
        if length(ws) == 0
            uni_occur
        elseif length(ws)==1
            bi_conditional[:,ws[1]]
        else
            Float64[joint(aa,ws...) for aa in 1:n_edges]./joint(ws...)
        end
    end
    
    function prior(ws)
        if  length(ws) > length(prob_length) #If it is longer than the tail of the length distribution, just force termination
            force_end_pmf = zeros(n_edges) #This PMF forces the sentence to end, if it is returned
            @inbounds force_end_pmf[end_index]=1.0
            force_end_pmf
        else 
            #Either do not end (P=1-P_end) or end (P=P_end)
            #prob_end = prob_length[length(ws)]   #Chance to end at this length
            # The end probability is currently fixed at zero, so the result below is
            # just the conditional occurrence weights.
            prob_end=0.0
            prob_occurance::Vector{Float64} = conditional_occurance(ws)
            
            choices_prob = (1.0-prob_end).*prob_occurance
            #@inbounds choices_prob[end_index]=prob_end
            choices_prob
        end 
    end
    prior,joint_cache_stats
end

# Convenience method: pull the four inputs of the statistics-based prior out of
# the data dictionary, using the end marker's word index as the end node.
function get_prior_fun(data::Dict)
    uni_occur = data["unioccur"]
    bi_joint = data["bioccur"]
    prob_length = data["length_prob"]
    end_index = data["word_indexes"][END_MARKER]
    get_prior_fun(uni_occur, bi_joint, prob_length, end_index)
end

In [None]:
# Build the prior for the loaded data, time one evaluation on a seven-node path,
# print the cache hit/miss counters, and display the result.
atis_prior,atis_prior_cache_stats = get_prior_fun(atis_data)
r = @time atis_prior(Int64[1,2,5,4,5,6,7]) #9.335965 seconds
println(atis_prior_cache_stats)
r

In [None]:
# Inspect entries 200 through 300 of the index->word list.
atis_data["indexed_words"][200:300]

In [None]:
# `optimise` has no method taking (data, sentence, ...); the sentence-level entry
# point is `built_optimise`, with the tuning values given as keywords.
built_optimise(atis_data,"what is the shortest flight", false; α=1, β=1, ρ=0.1, n_ants=10000, n_gens=20)

In [None]:
# `optimise` has no method taking (data, sentence, ...); the sentence-level entry
# point is `built_optimise`, with the tuning values given as keywords.
built_optimise(atis_data,"what is the first flight from washington to washington", false; α=1, β=1, ρ=0.1, n_ants=10000, n_gens=20)

In [None]:
# `optimise` has no method taking (data, sentence, ...); the sentence-level entry
# point is `built_optimise`, with the tuning values given as keywords.
built_optimise(atis_data,"what are the ground transport options", false; α=1, β=1, ρ=0.1, n_ants=1000, n_gens=20)

In [None]:
# `optimise` has no method taking (data, sentence, ...); the sentence-level entry
# point is `built_optimise`, with the tuning values given as keywords.
built_optimise(atis_data,"where can i go", false; α=1, β=1, ρ=0.1, n_ants=1000, n_gens=20)

In [None]:
# Look up the node index assigned to the start marker.
atis_data["word_indexes"][START_MARKER]
    

In [None]:
# Push the current branch; `run` throws if git exits with a non-zero status.
run(`git push`)

In [None]:
# Commit all tracked changes.
# NOTE(review): `-m=""` reaches git as the single argument `-m=`, so the commit
# message is presumably a literal "=" rather than empty -- confirm the intent.
`git commit -m="" -a` |> run