In [2]:
using FunctionalCollections
using Iterators
using Pipe
using Compat

"""
    @printval expr

Print the source text of `expr`, followed by `" = "` and the value that `expr`
evaluates to in the caller's scope.
"""
macro printval(ee)
    # `string` already renders the expression as text; no format call is needed.
    ee_expr = string(ee)
    # Escape only the caller's expression. `println` stays hygienic, so a binding
    # named `println` at the call site cannot hijack the expansion.
    :(println($ee_expr, " = ", $(esc(ee))))
end

"""
    @pz expr

Print the source text of `expr`, then (tab separated) the type and the size of the
value that `expr` evaluates to in the caller's scope.
"""
macro pz(ee)
    # `string` already renders the expression as text; no format call is needed.
    ee_expr = string(ee)
    # Escape only the caller's expression. `println`, `typeof` and `size` stay
    # hygienic, so a caller-side local called e.g. `size` does not break the macro.
    # The expression appears twice in the expansion and is therefore evaluated twice,
    # exactly as before.
    :(println($ee_expr, "\t\t", typeof($(esc(ee))), "\t", size($(esc(ee)))))
end

# Make modules in the working directory and in ../util/ loadable.
append!(LOAD_PATH, [".", "../util/"])

4-element Array{ByteString,1}:
 "/home/ubuntu/build/julia-master/usr/local/share/julia/site/v0.5"
 "/home/ubuntu/build/julia-master/usr/share/julia/site/v0.5"      
 "."                                                              
 "../util/"                                                       

In [3]:
# Sentinel tokens marking sentence boundaries in the trigram data.
# The two START markers together form the initial state used by `get_all_orders`.
const START_MARKER1 = "**START1**"
const START_MARKER2 = "**START2**"
# END_MARKER1 is the event whose probability closes a candidate sentence in
# `get_all_orders`; END_MARKER2 is not referenced by the code in this notebook.
const END_MARKER1 = "**END1**"
const END_MARKER2 = "**END2**"

# Load the serialized data dictionary; `open` closes the file once `deserialize` returns.
atis_data = open(deserialize, "atis_data2.jsz", "r")


Dict{ASCIIString,Any} with 5 entries:
  "length_prob"   => [0.00274146,0.0142225,0.0332953,0.0543865,0.0725589,0.0850…
  "LL"            => 300x1120 Array{Float64,2}:…
  "indexed_words" => AbstractString["in","for","that","is","on","with","said","…
  "trigrams"      => Any[("**START1**","**START2**","what"),("**START2**","what…
  "word_indexes"  => Dict{AbstractString,Int64}("carried"=>502,"diabetic"=>927,…

In [4]:
using PyCall
#http://www.nltk.org/howto/probability.html
@pyimport nltk
@pyimport nltk.probability as nltk_prob


# NLTK `KneserNeyProbDist` built over a `FreqDist` of the trigrams in `atis_data`.
kn_prob_dist = let trigram_counts = pycall(nltk_prob.FreqDist, PyObject, atis_data["trigrams"])
    nltk_prob.KneserNeyProbDist(trigram_counts)
end

"""
    trigram_model(given1, given2, event)

Return the probability that `kn_prob_dist` assigns to the trigram
`(given1, given2, event)`.
"""
function trigram_model(given1::AbstractString, given2::AbstractString, event::AbstractString)
    trigram = (given1, given2, event)
    return kn_prob_dist[:prob](trigram)
end



trigram_model (generic function with 1 method)

In [20]:
# Words passed as the optional `free_words` argument of `order` in a later cell.
zeroed_words = ["and", "a", "of", "to"]

4-element Array{ASCIIString,1}:
 "and"
 "a"  
 "of" 
 "to" 

In [5]:
# Small bags of words used to exercise the ordering functions.
example1 = ["show", "cheapest", "flight"]
example2 = ["francisco", "washington", "flight", "from", "cheapest", "san", "show"]
# example3 and example4 extend example2 with extra trailing words.
example3 = vcat(example2, ["to"])
example4 = vcat(example2, ["and", "shortest"])

9-element Array{ASCIIString,1}:
 "francisco" 
 "washington"
 "flight"    
 "from"      
 "cheapest"  
 "san"       
 "show"      
 "and"       
 "shortest"  

In [35]:
# S: the concrete string type used for words throughout this cell.
typealias S ASCIIString
# State{T}: a pair of values of type T; `transition_prob` splats a State{S} into the
# first two arguments of `trigram_model`, i.e. the two preceding words.
typealias State{T} Tuple{T,T}

"""
    transition_prob(cur_state, next_word)

Return the trigram probability of `next_word` following the two words held in
`cur_state`.
"""
function transition_prob(cur_state::State{S}, next_word::S)
    given1, given2 = cur_state
    return trigram_model(given1, given2, next_word)
end




# Type of one memo table: maps (state, remaining words) to a vector of
# (word list, probability) pairs. The second key component is typed `Any`; the code
# below stores the varargs tuple of remaining words there.
_get_options_cache_type = Dict{Tuple{State{S}, Any}, Vector{Tuple{plist{S}, Float64}}}
# One memo table per distinct `free_words` vector (see `get_all_orders`).
_get_options_caches = Dict{Vector{S}, _get_options_cache_type}()

"""
returns all possible orderings of the remaining words.
the freewords are also inserted into every possible position, EXCEPT at the end.
Freewords also will not be inserted after other freewords
(For now)
"""
function get_all_orders(free_words::Vector{S}, unordered_words::Vector{S})
    _get_options_cache = get!(_get_options_cache_type, _get_options_caches, free_words)
    #different cache if different freewords
    
    function get_options(cur_state::State{S})
        #Can never End in a freeword
        tp = transition_prob(cur_state, END_MARKER1)
        # Given P(END_MARKER2 | curstatep[2]==END_MARKER1) = 1.0
        # Do not need to consider P(END_MARKER2 | curstatep[2]==END_MARKER1, curstatep[1])
        [
            (EmptyList{S}(), tp)
        ]
    end
    
    function get_options(state::State{S}, remaining_words::S...)
        get!(_get_options_cache, (state, remaining_words)) do
            _get_options_cache[(state, remaining_words)] = _get_options(state, remaining_words...)
        end
    end

    function _get_options_inner(cur_state, word, remaining_words, get_option_fun)
        tp = transition_prob(cur_state, word) 
        next_state = (cur_state[2], word)
        
        if tp<=0.0 #Stop early, GOLDPLATE: take ϵ as a parameter for this
            return []
        end

        for (tail, next_tp) in get_option_fun(next_state, remaining_words...)
            produce(cons(word, tail),tp*next_tp)
        end
    end

    function _get_options_nofree(cur_state::State{S}, remaining_words::S...)
        (@task begin        
            for ii in 1:length(remaining_words)
                word = remaining_words[ii]
                new_remaining_words = remaining_words[[1:ii-1; ii+1:end]]
                
                _get_options_inner(cur_state, word, new_remaining_words, get_options)
            end
        end)
    end
    
    function _get_options_free(cur_state::State{S}, remaining_words::S...)
        @task begin        
            for word in free_words

                #Important: get_options_free, only chains to _get_options_nofree
                _get_options_inner(cur_state, word, remaining_words, _get_options_nofree)
            end
        end
    end
    
    function _get_options(cur_state::State{S}, remaining_words::S...)
        chain(_get_options_nofree(cur_state, remaining_words...),
             _get_options_free(cur_state, remaining_words...)) |> collect
    end
   
    initial_state = (START_MARKER1, START_MARKER2)
    get_options(initial_state, unordered_words...)
end

get_all_orders (generic function with 1 method)

In [36]:
"""
    order(unordered_words, best_n, free_words=S[])

Return up to `best_n` of the most probable orderings of `unordered_words`, most
probable first, as a vector of `(word_list, probability)` tuples. `free_words` are
forwarded to `get_all_orders` as optional extra words. Fewer than `best_n` entries are
returned when fewer orderings exist.
"""
function order(unordered_words::Vector{S}, best_n::Int, free_words::Vector{S}=S[])
    orders_and_probs = get_all_orders(free_words, unordered_words)
    best_n = min(best_n, length(orders_and_probs))
    # `get_all_orders` hands back its memoised vector, so use the copying `select`
    # rather than `select!` to avoid reordering the cache entry in place.
    select(orders_and_probs, 1:best_n, by=op -> -op[2])
end

"""
    order(unordered_words, free_words=S[])

Return the word list of the single most probable ordering of `unordered_words`.
`free_words` are forwarded as optional extra words. If no ordering is available, the
indexing into the empty result fails.
"""
function order(unordered_words::Vector{S}, free_words::Vector{S}=S[])
    # Ask for the top-1 list, take its only entry, and keep just the word list.
    order(unordered_words, 1, free_words)[1][1]
end

order (generic function with 4 methods)

In [37]:
# Enumerate every ordering of example1, with "the" as the only free word.
get_all_orders(S["the"], example1)

4-element Array{Tuple{FunctionalCollections.PersistentList{ASCIIString},Float64},1}:
 (ASCIIString(show, cheapest, flight),1.8261028957531455e-6)           
 (ASCIIString(show, flight, the, cheapest),3.338296258826786e-10)      
 (ASCIIString(show, the, cheapest, flight),6.2789039795265354e-6)      
 (ASCIIString(show, the, flight, the, cheapest),1.0545479122806868e-11)

In [38]:
# Up to five most probable orderings of example1, with "the" as the only free word.
order(example1,5, ["the"])

4-element Array{Tuple{FunctionalCollections.PersistentList{ASCIIString},Float64},1}:
 (ASCIIString(show, the, cheapest, flight),6.2789039795265354e-6)      
 (ASCIIString(show, cheapest, flight),1.8261028957531455e-6)           
 (ASCIIString(show, flight, the, cheapest),3.338296258826786e-10)      
 (ASCIIString(show, the, flight, the, cheapest),1.0545479122806868e-11)

In [39]:
# Up to five most probable orderings of example2, with zeroed_words as the free words.
order(example2,5, zeroed_words)

5-element Array{Tuple{FunctionalCollections.PersistentList{ASCIIString},Float64},1}:
 (ASCIIString(show, cheapest, flight, from, washington, to, san, francisco),2.428358368591127e-8)       
 (ASCIIString(show, cheapest, flight, from, san, francisco, to, washington),1.8420942102163784e-9)      
 (ASCIIString(show, cheapest, flight, from, washington, and, san, francisco),4.6966020913829104e-11)    
 (ASCIIString(show, cheapest, flight, from, san, francisco, and, washington),3.501936897770464e-11)     
 (ASCIIString(and, show, cheapest, flight, from, washington, to, san, francisco),3.5538569140616182e-12)