In [None]:
# Start 4 additional worker processes; the @everywhere, remotecall and pmap calls below rely on them.
addprocs(4)

In [None]:
import FunctionalCollections
import Iterators
import Pipe
import Compat
import JLD
@everywhere using FunctionalCollections
@everywhere using Iterators
@everywhere using Pipe
@everywhere using Compat
@everywhere using JLD

# Debug helper: `@printval expr` prints the source text of `expr`, then " = ",
# then the value of `expr`. The generated call is escaped as a whole, so it
# is evaluated entirely in the caller's scope.
macro printval(ee)
    label = string(ee)
    esc(:(println($label, " = ", $ee)))
end

# Debug helper: `@pz expr` prints the source text of `expr`, two tabs, the type
# of its value, a tab, and its size. The generated call is escaped as a whole,
# so it is evaluated entirely in the caller's scope.
macro pz(ee)
    label = string(ee)
    esc(:(println($label, "\t\t", typeof($ee), "\t", size($ee))))
end

# Add the current directory and ../util/ to the module search path.
push!(LOAD_PATH, ".")
push!(LOAD_PATH, "../util/")

In [None]:
#shuffled_indexes = 1:length(ground_sents) |> collect |> shuffle!
#nfolds=10
#fold_indexes = Vector{Int}[
#    shuffled_indexes[(ii-1)*end÷nfolds + 1: ii*end÷nfolds]
#    for ii in 1:nfolds]

#@save("brown_glove_folds.jld", fold_indexes)

In [None]:
# Load the saved cross-validation fold index lists on every process
# (the commented-out cell above shows how the file was produced).
@everywhere fold_indexes=load("brown_glove_folds.jld","fold_indexes")

# Split the bag-of-words results into test and training portions for one fold.
#
# Arguments:
#   fold_ii     - index into the global `fold_indexes`; that entry lists the
#                 positions of the test items.
#   raw_bow_res - collection whose elements hold the ground-truth sentence at
#                 position 1 and the reconstructed bag of words at position 2.
#
# Returns `(test_unordered_sents, test_ground, training_sents)`, where
# `training_sents` are the ground-truth sentences NOT in the test fold.
@everywhere function fold_split(fold_ii, raw_bow_res)
    ground_sents = Vector{ASCIIString}[rset[1] for rset in raw_bow_res]
    reconstructed_bows = Vector{ASCIIString}[rset[2] for rset in raw_bow_res]

    test_indexes = fold_indexes[fold_ii]
    # Boolean mask that is true for every sentence outside the test fold.
    training_indexes = trues(length(ground_sents))
    training_indexes[test_indexes] = false

    test_unordered_sents = reconstructed_bows[test_indexes]
    test_ground = ground_sents[test_indexes]
    # Use the complement mask; indexing with the test fold here would make the
    # language model train on exactly the sentences it is evaluated on.
    training_sents = ground_sents[training_indexes]
    test_unordered_sents, test_ground, training_sents
end
    

In [None]:
import PyCall
@everywhere using PyCall
#http://www.nltk.org/howto/probability.html
@everywhere @pyimport nltk
@everywhere @pyimport nltk.probability as nltk_prob

# Build a trigram language model from `train_corpus` (a vector of tokenised
# sentences) using NLTK's KneserNeyProbDist over a FreqDist of trigrams.
# Every sentence is wrapped in the two start and two end markers before its
# trigrams are extracted.
#
# Returns a function `(given1, given2, event)` that looks up the probability
# of the trigram `(given1, given2, event)` in the fitted distribution.
@everywhere function train_language_model{T}(train_corpus::Vector{Vector{T}})
    # Drain a Python iterable into a Julia array by explicit iteration.
    function py_collect(py_iter::PyObject)
        items = []
        for item in py_iter
            push!(items, item)
        end
        items
    end
    # Surround a sentence with the start and end markers.
    function trigram_buffer(sent)
        [START_MARKER1, START_MARKER2, sent..., END_MARKER1, END_MARKER2]
    end

    per_sentence_trigrams = [py_collect(nltk.trigrams(trigram_buffer(sent))) for sent in train_corpus]
    training_trigrams = vcat(per_sentence_trigrams...)
    # Keep the FreqDist as a raw PyObject so it is handed to NLTK unconverted.
    freq_dist = pycall(nltk_prob.FreqDist, PyObject, training_trigrams)
    kn_prob_dist = nltk_prob.KneserNeyProbDist(freq_dist)

    function trigram_model(given1::S, given2::S, event::S)
        kn_prob_dist[:prob]((given1, given2, event))
    end
    trigram_model
end

In [None]:
# Sentence boundary tokens: two start and two end markers that trigram_buffer
# (inside train_language_model) places around each sentence.
@everywhere const START_MARKER1 = "**START1**"
@everywhere const START_MARKER2 = "**START2**"
@everywhere const END_MARKER1 = "**END1**"
@everywhere const END_MARKER2 = "**END2**"


In [None]:
# S: the string type used for words throughout the ordering code.
@everywhere typealias S ASCIIString
# State{T}: a pair of values; get_all_orders uses it for the two most recent words.
@everywhere typealias State{T} Tuple{T,T}
# OrderOptionsCache: maps (state, remaining words) to a list of (ordering, probability) pairs.
@everywhere typealias OrderOptionsCache Dict{Tuple{State{S}, Vector{S}}, Vector{Tuple{plist{S}, Float64}}}


# Enumerate candidate orderings of `unordered_words`, each paired with its
# probability under a trigram language model, using a depth-first search
# that expands at most `beam_width` next-word candidates per step.
#
# Arguments:
#   unordered_words - the bag of words to put in order.
#   languauge_model - function (given1, given2, event) returning the
#                     probability of `event` after the two preceding words.
#   beam_width      - (keyword) maximum number of candidates expanded at each
#                     step. Must be integer-valued or `Inf`; the default `Inf`
#                     expands every remaining word.
#
# Returns a collection of (ordering, probability) pairs, where each ordering
# is a persistent list built with `cons`. Orderings whose probability is zero
# are never produced, so the result can be empty.
#
# NOTE: an earlier version of this description mentioned "freewords"; this
# function takes no such argument.
@everywhere function get_all_orders(unordered_words::Vector{S}, languauge_model::Function; beam_width=Inf)
    _get_options_cache = OrderOptionsCache()

    # Probability of `next_word` given the two words held in `cur_state`.
    function transition_prob(cur_state::State{S}, next_word::S)
        languauge_model(cur_state[1],cur_state[2], next_word)
    end

    # Memoised entry point: results for a (state, remaining words) pair are
    # computed once and reused. The terminal case (no words left) is not cached.
    function get_options(state::State{S}, remaining_words)
        if length(remaining_words)>0
            get!(_get_options_cache, (state, remaining_words)) do
                _get_options(state, remaining_words)
            end
        else
            _get_options(state)
        end
    end

    # Terminal case: every word has been placed, so the only option is to end
    # the sentence.
    function _get_options(cur_state::State{S})
        tp = transition_prob(cur_state, END_MARKER1)
        # P(END_MARKER2 | previous word is END_MARKER1) is taken to be 1.0,
        # so the second end marker does not need to be scored.
        [(EmptyList{S}(), tp)]
    end

    # Recursive case: try the best-scoring next words and combine each with
    # every completion of the words that remain.
    function _get_options(cur_state::State{S}, remaining_words)
        function inner()
            (@task begin
                branches = Tuple{Int, Float64}[(ii,transition_prob(cur_state, next_word))
                                                for (ii, next_word) in enumerate(remaining_words)]
                # With the default beam_width=Inf, `min` returns a Float64; convert
                # so that `1:beam_end` is an integer range that `select!` accepts.
                beam_end = Int(min(length(remaining_words), beam_width))
                beam = select!(branches, 1:beam_end, by=ip->ip[2], rev=true)

                for (next_word_ii, tp) in beam
                    if tp==0.0
                        break #There are no good solutions left (as it is sorted best first)
                    end

                    @inbounds word = remaining_words[next_word_ii]
                    new_remaining_words = sub(remaining_words,[1:next_word_ii-1; next_word_ii+1:length(remaining_words)])
                    @inbounds next_state = (cur_state[2],word)
                    tails_and_tailprobs = get_options(next_state, new_remaining_words) #Actually doing a beam depth-first

                    for (tail, tailprob) in tails_and_tailprobs
                        total_prob = tp*tailprob
                        if total_prob>0.0
                            produce(cons(word, tail), total_prob)
                        end
                    end
                end
            end)
        end

        inner() |> collect
    end

    initial_state = (START_MARKER1, START_MARKER2)
    get_options(initial_state, unordered_words)
end

In [None]:
# Return the `best_n` most probable orderings of `unordered_words` under
# `language_model`, as a vector of (ordering, probability) tuples.
#
# Keywords:
#   beam_width - forwarded to `get_all_orders`.
#   best_n     - how many orderings to return (capped at the number found).
#
# If no ordering with non-zero probability is found, the result is the single
# entry `(unordered_words, 0.0)`.
function order(unordered_words::Vector{S}, language_model::Function; beam_width=Inf, best_n::Int=1)
    # Pass the model and the beam width the same way best_order does.
    orders_and_probs = get_all_orders(unordered_words, language_model; beam_width=beam_width)
    best_n = min(best_n, length(orders_and_probs))
    if best_n==0 #None found
        #warn("No possible ordering found. Defaulting to unordered: ".*string(unordered_words))
        Tuple[(unordered_words, 0.0)]
    elseif best_n==1 #4x as fast as the else
        max_prob, max_ii = @pipe orders_and_probs |> map(op->op[2],_) |> findmax
        Tuple[orders_and_probs[max_ii]]
    else
        # Negating the probability makes the ascending selection return the largest first.
        select!(orders_and_probs, 1:best_n, by=op->-op[2] )
    end
end

# Like `order`, but each ordering is joined into a space-separated string and
# its probability is divided by the sum of the returned probabilities.
# When that sum is zero, the probabilities are left as they are (divided by 1.0).
function norm_order(unordered_words::Vector{S}, language_model::Function;  kwargs...)
    candidates = order(unordered_words, language_model; kwargs...)
    prob_mass = sum(map(cand->cand[2], candidates))
    if prob_mass == 0.0
        prob_mass = 1.0
    end
    [(join(words, " "), p/prob_mass) for (words, p) in candidates]
end

# Return the single most probable ordering of `unordered_words` and its
# probability divided by the sum over all orderings that were found.
# Keyword arguments are forwarded to `get_all_orders`.
# If no ordering is found, returns `(unordered_words, 0.0)`.
@everywhere function best_order(unordered_words::Vector{S}, language_model::Function;  kwargs...)
    candidates = get_all_orders(unordered_words, language_model; kwargs...)
    if isempty(candidates)
        return (unordered_words, 0.0)
    end

    # Extract the probabilities once and reuse them for both the sum and the maximum.
    probs = map(cand->cand[2], candidates)
    prob_mass = sum(probs)
    top_prob, top_ii = findmax(probs)
    top_order, top_order_prob = candidates[top_ii]
    @assert(top_order_prob==top_prob)
    top_order, top_order_prob/prob_mass
end
    

In [None]:

# Smoke test for the per-worker language models: train one model on each
# worker from fold `fold_ii`'s training data, then evaluate two fixed trigrams
# through pmap, each on whichever worker picks up the task.
function test(fold_ii)
    test_bows, test_ground_order, train = fold_split(fold_ii,raw_bow_res)
    # One remote reference per worker id, each pointing at a model trained on that worker.
    models_by_worker = Dict([pid=>remotecall(pid, train_language_model, train) for pid in workers()])
    trigram_queries = [("a","man", "did"),("the","woman", "did")]
    pmap(trigram_queries, err_stop=true) do trigram
        model = fetch(models_by_worker[myid()])
        model(trigram...)
    end
end

test(2)

In [None]:
# Sentences whose ground-truth length exceeds this are not reordered (see process_fold).
@everywhere const len_cap = 18
# Bag-of-words results, loaded on every process; fold_split reads the ground-truth
# sentence from position 1 and the reconstructed bag from position 2 of each entry.
@everywhere raw_bow_res = load("results/bags/brown_glove300_res.jld", "res")

# Order every test bag of fold `fold_ii` in parallel.
# Returns one `(ground_order, generated_order, prob)` tuple per test sentence.
# Sentences longer than `len_cap` are skipped: their bag is returned unchanged
# with a probability of NaN.
function process_fold(fold_ii)
    test_bows, test_ground_order, train = fold_split(fold_ii,raw_bow_res)

    # Each worker trains its own copy of the language model, which avoids
    # "serialising a pointer". It also keeps the models independent: under the
    # hood they are not read-only, because reading them updates an internal cache.
    models_by_worker = Dict([pid=>remotecall(pid, train_language_model, train) for pid in workers()])
    pmap(test_ground_order, test_bows, err_stop=true) do ground_order, bow
        model = fetch(models_by_worker[myid()])
        if length(ground_order)<=len_cap
            generated_order, prob = best_order(bow, model; beam_width=5)
        else
            generated_order, prob = (bow, NaN)
        end
        (ground_order, generated_order, prob)
    end
end

# Run every fold and store each fold's results in the output file under the
# key "fold_<n>".
jldopen("brown_glove300_ordered.jld", "w") do file
    for fold_ii in 1:length(fold_indexes)
        res = process_fold(fold_ii)
        write(file, "fold_$(fold_ii)", res)
    end
end


In [None]:
# Load the saved per-fold results back into scope (written above as fold_1, fold_2, ...).
@load "brown_glove300_ordered.jld"

In [None]:
# Inspect the results of the first fold.
fold_1

In [None]:
# Single-process experiment: split fold 2 and train a model in this process.
test_bows, test_ground_order, train = fold_split(2,raw_bow_res)
language_model = train_language_model(train)



In [None]:
# Time the ordering of one test bag.
@time ord, prob = best_order(test_bows[4110], language_model; beam_width=5)

In [None]:
# NOTE(review): `test_bow`, `target_sent`, `sol` and `score` are not defined
# anywhere in the visible source; this cell looks stale — confirm before running.
res = pmap(test_bow, err_stop=true) do bow
    order, prob = best_order(bow, language_model; beam_width=5)
    (target_sent, sol, score)
end


In [None]:
# Scratch arithmetic — presumably the path count for beam width 5 over 20 words; confirm.
5^20

In [None]:
# Find the positions of the entries of length 50.
# NOTE(review): `test` is the smoke-test function in the visible source, not a collection — looks stale.
@pipe test |> map(x->length(x)==50, _) |> find

In [None]:
@time best_order(test[60], language_model, beam_width=5)

In [None]:
# NOTE(review): `ground_sents` is only a local inside fold_split in the visible source.
ground_sents[16]

In [None]:
# Trigrams of each training sentence, collected per sentence.
training_trigrams = [nltk.trigrams(sent)|>collect for sent in train]


In [None]:
# NOTE(review): `order` requires a Function as its second argument, so this
# call does not match its visible signature; `examples` is not defined in the visible source.
map(examples) do unordered_words
    order(unordered_words, ASCIIString[])
    end
In [None]:
# Keep only the cases with at most 20 words and order them in parallel.
# NOTE(review): `unordered_output`, `test_set` and `zeroed_words` are not defined
# in the visible source, and `order` requires a Function as its second argument.
short_cases = Bool[length(ws) <=20 for ws in unordered_output]

true_ordered_sents = test_set[short_cases]
ordered_sents_and_probs = pmap(unordered_output[short_cases]) do unordered_words
    order(unordered_words, zeroed_words)
end

In [None]:
# Take the first component of each result.
# NOTE(review): the trailing `be` argument is not defined in the visible source — looks like a stray edit.
ordered_sents = map(op->op[1], ordered_sents_and_probs, be)

In [None]:
# Fraction of generated orderings that exactly equal the true ordering.
perfect_matches = Bool[ordered_sents[ii] == true_ordered_sents[ii]
                       for ii in 1:length(ordered_sents)]
mean(perfect_matches)

In [None]:
@pyimport nltk
@pyimport nltk.translate.bleu_score as nltk_bleu

# BLEU score of `candidate` against a single `reference`, computed by NLTK
# with four equal weights of 0.25. Both arguments are collected into arrays
# first. An exact match returns 1.0 without calling NLTK.
function bleu_score(candidate, reference)
    ref_tokens = collect(reference)
    cand_tokens = collect(candidate)

    if ref_tokens != cand_tokens
        uniform_weights = [1,1,1,1]/4
        return nltk_bleu.bleu(Any[ref_tokens], cand_tokens, uniform_weights)
    end
    1.0 #Perfect Match
end

In [None]:
# Mean BLEU score over all generated/true sentence pairs.
map(bleu_score, ordered_sents,true_ordered_sents) |> mean


In [None]:
# Sanity check: a sentence scored against itself takes the exact-match branch.
bleu_score(true_ordered_sents[3], true_ordered_sents[3])

In [None]:
true_ordered_sents

In [None]:
ordered_sents

In [None]:
# Compare one generated ordering with its input bag and its true ordering.
ordered_sents[50]

In [None]:
# NOTE(review): `eval_cases` is not defined in the visible source.
unordered_output[short_cases][eval_cases][50]

In [None]:
true_ordered_sents[50]

In [None]:
# Debugging the NLTK BLEU call through PyCall with explicit argument types.
#x= ["A", "B", "C", "D"]
#y= UTF8String["A", "B", "C", "D"]
x = true_ordered_sents[10]
y=ordered_sents[10] |> collect
pycall(nltk_bleu.bleu, PyAny, Any[x], y, Any[0.25, 0.25, 0.25, 0.25])

In [None]:
Any[true_ordered_sents[1]]


In [None]:
# NOTE(review): `reference1`, `reference2`, `reference3`, `candidate1` and `weights`
# (used in the cells below) are not defined in the visible source.
nltk_bleu._modified_precision(Any[reference1, reference2, reference3],candidate1, 1)

In [None]:
@pyimport pdb

In [None]:
# Step through NLTK's modified-precision routine in the Python debugger.
pdb.runcall(nltk_bleu._modified_precision, [reference1, reference2, reference3],candidate1, 1)

In [None]:
# I think BLEU has to be reimplemented in Julia, because for some reason the NLTK version does not play nicely with PyCall.
# It can basically be ported from http://www.nltk.org/_modules/nltk/align/bleu_score.html#bleu

In [None]:
pycall(nltk_bleu.bleu, Int, candidate1, [reference1], weights)

In [None]:
# NOTE(review): `py_collections` is only imported in the next cell.
@pyimport nltk.util as nltk_util
nltk_util.ngrams(candidate1,2) |> py_collections.Counter

In [None]:
@pyimport collections as py_collections

In [None]:
# Count bigrams, keeping the Counter as a raw PyObject.
@pyimport nltk.util as nltk_util
ngs = nltk_util.ngrams(candidate1,2)
pycall(py_collections.Counter, PyObject, ngs)

In [None]:
import nltk