In [None]:
using FunctionalCollections
using Iterators
using Pipe
using Compat

# Debugging helper, defined on every process: prints the source text of an
# expression followed by " = " and the expression's value.
@everywhere macro printval(ee)
    label = string(ee)
    esc(quote
        println($label, " = ", $ee)
    end)
end

# Debugging helper: prints the source text of an expression together with the
# type and the size of its value, separated by tabs.
macro pz(ee)
    label = string(ee)
    esc(quote
        println($label, "\t\t", typeof($ee), "\t", size($ee))
    end)
end

# Make code in the working directory and in ../util/ loadable on this process.
push!(LOAD_PATH, ".")
push!(LOAD_PATH, "../util/")

In [None]:
# Start 11 additional worker processes.
addprocs(11)

In [None]:
# JLD is loaded and `data` is read from disk on every process, because both
# lines are wrapped in @everywhere.
@everywhere using JLD
@everywhere data = load("results/data/books/books300d.jld")
corpus_filename = "test_books_corpus_0.001_of_test"
# The test set is deserialized on the current process only.
test_set = open(deserialize, "results/data/books/"*corpus_filename*".jsz")

In [None]:
#@everywhere using Distances

In [None]:
# Convenience method: tokenise `sent` with `split` and delegate to the
# vector-of-words method of `lookup_sowe`.
@everywhere function lookup_sowe(data, sent)
    tokens = split(sent)
    lookup_sowe(data, tokens)
end

# Sum the columns of data["LL"] selected by each word of `sent`.
# data["word_indexes"] maps a word to its column number in data["LL"].
# A word that occurs several times contributes its column several times.
@everywhere function lookup_sowe{S}(data, sent::Vector{S})
    embeddings = data["LL"]
    column_of = data["word_indexes"]
    columns = [embeddings[:, column_of[word]] for word in sent]
    sum(columns)
end

# Translate a sequence of word indexes (`path`) into the corresponding words,
# using the index -> word table stored in data["indexed_words"].
@everywhere function lookup_words(data, path)
    vocabulary = data["indexed_words"]
    ASCIIString[vocabulary[idx] for idx in path]
end

# Translate the words of `sent` into their integer indexes, using the
# word -> index table stored in data["word_indexes"].
@everywhere function lookup_indexes{S}(data, sent::Vector{S})
    index_of = data["word_indexes"]
    Int[index_of[word] for word in sent]
end

In [None]:
# Improvement tolerance shared by the search routines on every process:
# a candidate score has to exceed the current best by more than ϵ to be accepted.
@everywhere const ϵ = 10.0^-5

In [None]:
# Return the sum of the columns of `LL` selected by the word indexes `ws`
# (repeated indexes are counted repeatedly).
#
# For an empty `ws` the result is a zero vector with one entry per row of `LL`.
# It is created with LL's element type, so a Float32 `LL` no longer yields a
# Float64 start point that would promote every later computation to Float64.
@everywhere @inline function get_end(LL, ws::AbstractVector{Int})
    if length(ws) == 0
        return zeros(eltype(LL), size(LL,1))
    end
    @inbounds sofar = sum([sub(LL,(:,ii)) for ii in ws])
    sofar
end

# Score every column of `LL` as a candidate addition to the current word set.
# Entry j of the returned 1-row matrix is the negated Euclidean norm of
# (end_point + LL[:,j] - target), so larger (closer to zero) is better.
# A city-block variant would use sumabs(..., 1) without the sqrt.
@everywhere function score_possible_additions(LL, target, end_point)
    residual = end_point .- target
    candidate_gaps = LL .+ residual
    -sqrt(sumabs2(candidate_gaps, 1)) # Euclidean
end

# Fitness of a candidate end point: the negated Euclidean distance to `target`.
# Fitter is larger; the city-block alternative would be -sumabs(end_point.-target).
@everywhere @inline function fitness(target, end_point)
    gap = end_point .- target
    -norm(gap) # Euclidean
end

# Greedily extend a word set so that the sum of its columns of `LL` approaches `target`.
#
# Arguments
#   LL               - matrix whose columns are the candidate word vectors
#   target           - vector to approximate
#   initial_word_set - column indexes already selected (never modified)
#   max_additions    - upper bound on the number of loop iterations (default: unbounded)
#
# Each iteration scores every column as a possible addition and appends the best
# one if it beats the current score by more than ϵ; otherwise the loop stops.
#
# Returns (word_set, score): a freshly allocated Vector{Int} and its fitness.
@everywhere function greedy_addition{F<:AbstractFloat}(LL::Matrix{F},
                         target::Vector{F},
                         initial_word_set::AbstractVector{Int},
                         max_additions = Inf)
    # Build a fresh vector. convert(Vector{Int}, x) hands back `x` itself when it
    # already is a Vector{Int}, so the push! below would mutate the caller's input.
    best_word_set = Int[ii for ii in initial_word_set]
    end_point = get_end(LL, best_word_set)
    best_score = fitness(target, end_point)

    cur_additions = 0
    while(cur_additions<max_additions)
        cur_additions+=1
        addition_scores = score_possible_additions(LL, target, end_point)
        addition_score, addition = findmax(addition_scores)
        if addition_score>best_score+ϵ
            best_score=addition_score
            push!(best_word_set, addition)
            end_point += sub(LL,(:,addition))
        else
            # No column improves the score by more than ϵ: stop.
            break
        end
    end
    best_word_set,best_score
end


# Try to improve a word set by dropping one word and optionally adding one
# greedily chosen replacement.
#
# For every position except the last (the last word was added greedily, so it
# is not reconsidered) two candidates are evaluated: the set without that word,
# and that reduced set extended by at most one greedy addition. A candidate
# replaces the incumbent only when its score is better by more than ϵ.
#
# Returns (word_set, score). The word set is a copy of the input when nothing
# improved, otherwise whichever candidate object won.
@everywhere function word_swap_refinement{F<:AbstractFloat}(LL::Matrix{F},
                              target::Vector{F},
                              initial_word_set::AbstractVector{Int})
    full_end_point = get_end(LL, initial_word_set)
    best_score = fitness(target, full_end_point)
    best_word_set = copy(initial_word_set)

    # Adopt `candidate` when it beats the incumbent by more than ϵ (scores are negative).
    function consider!(candidate, candidate_score)
        if candidate_score > best_score + ϵ
            best_score = candidate_score
            best_word_set = candidate
        end
    end

    n_words = length(initial_word_set)
    for drop_pos in 1:(n_words - 1)
        kept_positions = vcat(1:drop_pos-1, drop_pos+1:n_words)
        reduced_set = sub(initial_word_set, kept_positions)
        dropped_column = sub(LL, (:, initial_word_set[drop_pos]))
        reduced_score = fitness(target, full_end_point - dropped_column)
        consider!(reduced_set, reduced_score)

        # Try adding just one word greedily to the reduced set.
        grown_set, grown_score = greedy_addition(LL, target, reduced_set, 1)
        consider!(grown_set, grown_score)
    end

    best_word_set, best_score
end


# Search for a bag of words whose summed columns of data["LL"] approximate `target`.
#
# Alternates greedy_addition and word_swap_refinement for at most `rounds` rounds.
# The loop ends early when the addition step reaches a score >= 0.0, when the
# swap score is within ϵ of the previous round's score, or when the swap score
# is >= -ϵ.
#
# Keywords
#   rounds - maximum number of addition/swap rounds
#   log    - when true, print the score and words after each step
#
# Returns (words, best_score), where words are looked up in data["indexed_words"].
@everywhere function greedy_search{F<:AbstractFloat}(data::Dict, target::Vector{F}; rounds=1000, log=false)
    # Map word indexes back to words (used for logging and for the result).
    get_words(word_iis) = [data["indexed_words"][ii] for ii in word_iis]
    
    word_iis = Int[]
    best_word_iis = word_iis
    best_score=-Inf
    for round in 1:rounds
        word_iis, add_score = greedy_addition(data["LL"], target, word_iis)
        log && println("POST_ADD_STEP: $add_score $(get_words(word_iis))")
        # Sanity check: the addition step must not make the score worse (up to ϵ).
        @assert add_score + ϵ >= best_score || best_word_iis == word_iis "$add_score vs $best_score $(get_words(word_iis))"
        best_word_iis = word_iis
        
        # fitness is a negated norm, so a score >= 0.0 means the target was hit.
        if add_score>= 0.0 
            best_score = add_score
            break 
        end        
        

        
        word_iis, swap_score = word_swap_refinement(data["LL"], target, word_iis)
        log && println("POST_SWAP_STEP: $swap_score $(get_words(word_iis))")
        # Sanity check: the swap step must not make the score worse (up to ϵ).
        @assert swap_score + ϵ >= add_score || best_word_iis == word_iis
        best_word_iis = word_iis
        
        # Converged when this round changed the score by less than ϵ, or the
        # score is within ϵ of zero. best_score still holds the previous round's value here.
        converged = best_score - ϵ<swap_score<best_score + ϵ || swap_score>=-ϵ
        best_score=swap_score
        if converged
            break 
        end       
    end
    get_words(word_iis),best_score
end



# Sentence front end: convert the words of `target_sent` into their summed
# vector with lookup_sowe, then run the vector-based greedy_search with the
# same keyword arguments.
@everywhere function greedy_search{S}(data::Dict, target_sent::Vector{S}; kwargs...)
    greedy_search(data, lookup_sowe(data, target_sent); kwargs...)
end

In [None]:
# Smoke test on one sentence, with the words converted to symbols.
# NOTE(review): `data_sym` is not defined anywhere in the visible notebook --
# presumably a symbol-keyed variant of `data`; confirm before re-running.
test_sentence = ASCIIString["a","few","folks","around","here","dont","like","you",",","you","know",",","jake",",","said","the","pastor","."]
test_sentence = map(symbol,test_sentence)
@time greedy_search(data_sym, test_sentence, rounds=10_000, log=true)

In [None]:
gc()

In [None]:
using Blocks
using Lumberjack
# Send Lumberjack log messages to selection.log as well.
add_truck(LumberjackTruck("selection.log"), "file-logger")

In [None]:
# NOTE(review): a lone `@` is not a complete expression; this cell looks like an unfinished edit.
@

In [None]:
# Wrap the test set in a Block; `run` maps over it block by block.
# NOTE(review): meaning of the arguments (1, 100) comes from Blocks.jl -- TODO confirm.
test_set_blocks = Block(test_set, 1, 100)

In [None]:
# Open a JLD file for `func`, creating it when necessary.
# An existing file is opened with "r+" so its contents are kept; "w" is used
# only if the file doesn't already exist.
function jldopen_append(func::Function, filename::AbstractString)
    if isfile(filename)
        return jldopen(func, filename, "r+")
    end
    jldopen(func, filename, "w")
end

In [None]:
# One result per sentence: (target words, selected words, score).
const res_type = Tuple{Array{ASCIIString,1},Array{ASCIIString,1},Float32}

# Run greedy_search over every block of `test_set_blocks` and append the
# results to the JLD file `save_path`.
#
# For each block the sentences are processed with pmap; the block's results are
# written under the key string(ii), where ii is the 1-based block counter, and
# the block's average score is logged. Any exception is logged through
# Lumberjack instead of being propagated to the caller.
#
# NOTE(review): this definition uses the same name as Base.run -- confirm that
# the shadowing is intended.
function run(save_path="selection.jld")
    try
        Lumberjack.info("Began selection")
        ii = 0
        map(test_set_blocks) do block
            block_res::Vector{res_type} = pmap(block,err_stop=true) do target_sent
                target_sent=map(string, target_sent)
                sol, score = greedy_search(data, target_sent, rounds=10_000, log=false)
                (target_sent, sol, score)
            end
            # Accumulate the score from the returned results. Updating a captured
            # counter inside the pmap closure only changes the copy on the
            # executing process, so the logged average would stay at 0.
            net_score = 0.0f0
            for sent_res in block_res
                net_score += sent_res[3]
            end
            ii+=1
            avg_score = net_score/length(block)
            Lumberjack.info("$ii done: $avg_score")
            jldopen_append(save_path) do fh
                write(fh,string(ii), block_res)
            end
            Lumberjack.debug("$ii written to disc")
        end
    catch err
        Lumberjack.error("Unhandled Error", base_exception=err)
    end
    Lumberjack.info("complete selection")
end


In [None]:
# Full run over the test set; results go to the given JLD file.
run("results/bags/books_corpus_0.001_of_test_glove300.jld")

In [None]:
# Smoke test on a single sentence with step-by-step logging.
test_sentence = ASCIIString["a","few","folks","around","here","dont","like","you",",","you","know",",","jake",",","said","the","pastor","."]
@time greedy_search(data, map(string,test_sentence), rounds=10_000, log=true)

In [None]:
gc()

In [None]:
# Norm of the vector stored for the underscore token.
data["LL"][:,data["word_indexes"]["_____________"]] |> norm

In [None]:
# NOTE(review): `data_sym` is not defined in the visible notebook -- confirm where it comes from.
@save "results/data/books/books300d_sym.jld" data_sym

In [None]:
?@save

In [None]:
[a=>2*b for (a,b) in [(1,2),(4,5)]]

In [None]:
# Parallel search over the first 20 test sentences; each result is (target words, solution, score).
res = pmap(test_set[1:20], err_stop=true) do target_sent
#res = map(test_set) do target_sent
    target_sent=map(string, target_sent)
    sol, score = greedy_search(data, target_sent, rounds=10_000, log=false)
    (target_sent, sol, score)
end

In [None]:
using JLD
@save "results/bags/books300d.jld" res

In [None]:
res[1] |> typeof |> typeof

In [None]:
# Pick one test sentence; target_sowe, best_words and best_score are reused by the experimental cells.
target_sent = test_set[67]
target_sowe = lookup_sowe(data,target_sent)
println(target_sent)
println("-------------")
best_words, best_score  = greedy_search(data, target_sent, log=true)

In [None]:
#########
# Experimental
#####

In [None]:
methodswith(pset)

In [None]:
a=push(pset(),3)
union(a, 5,4) |> length


In [None]:
# Try to improve a word set by deleting up to `max_remove` of its words.
#
# Every combination of at most `max_remove` positions of `best_word_set` is
# evaluated; a combination is adopted when the fitness of the remaining words
# beats the best score so far by more than ϵ.
#
# Arguments
#   LL            - matrix whose columns are the word vectors
#   target        - vector to approximate
#   best_word_set - current word indexes (not modified; duplicates are kept apart)
#   max_remove    - maximum number of words that may be deleted
#
# Returns (word_set, score): always a new Vector{Int} and its fitness.
function word_delete_refinement(LL::Matrix{Float64},
                              target::Vector{Float64},
                              best_word_set::Vector{Int},
                              max_remove = 2
        )
    n_words = length(best_word_set)
    initial_end_point = get_end(LL, best_word_set)
    best_score = fitness(target, initial_end_point)
    best_removed = Int[] # positions (not word indexes) deleted in the best candidate

    # Extend the deletion `removed` (positions, ascending) by one more position.
    # `endpoint` is the sum of the words that are still present.
    function inner(removed, endpoint, first_pos)
        if length(removed) >= max_remove
            return
        end
        # Only positions after the last removed one are tried, so each
        # combination is visited once rather than once per ordering.
        for pos in first_pos:n_words
            new_endpoint = endpoint - sub(LL,(:,best_word_set[pos]))
            new_removed = [removed; pos]
            del_score = fitness(target, new_endpoint)

            if del_score > best_score + ϵ
                best_removed = new_removed
                best_score = del_score
            end

            # Recurse from the end point that already excludes this word.
            inner(new_removed, new_endpoint, pos + 1)
        end
    end

    inner(Int[], initial_end_point, 1)

    keep = trues(n_words)
    keep[best_removed] = false
    best_word_set[keep], best_score
end

In [None]:
# Hand-written word bag used to try out word_delete_refinement.
# Relies on `target_sowe` from an earlier cell.
before_words = ["not","after","said","election","to","the",",","calls","williams","due","bill", "anonymous",",","subjected","opens","consisted","snodgrass",".","folks","meanwhile","truculent","was","soon", "he"]
before_word_iis = lookup_indexes(data, before_words)
fitness(target_sowe, get_end(data["LL"], before_word_iis))
word_delete_refinement(data["LL"], target_sowe, before_word_iis, 2)



In [None]:
# Fitness of the unrefined bag, for comparison.
fitness(target_sowe, get_end(data["LL"], before_word_iis))

In [None]:
# Shorter/longer refinement, for replacing n words with m other words.
# Either the words in our current set are too short, so we can limit our search
# to words that are longer than the sum of our shortest, or they are too long,
# in which case we do the reverse.
#
# Here we are removing something and plan to get closer to the target by adding
# a finite number (`m_add`) of other words.
#
# Arguments
#   LL             - matrix whose columns are the word vectors
#   target         - vector to approximate
#   best_word_set  - current word indexes
#   remove_indexes - positions within `best_word_set` to remove
#   m_add          - number of words that may be added back (must be >= 0)
#
# Returns the indexes of the columns of `LL` that pass the length filter below.
function make_longer_refinement2(LL::Matrix{Float64},
                              target::Vector{Float64},
                              best_word_set::Vector{Int},
                              remove_indexes::Vector{Int},
                              m_add::Int)
    @assert(m_add>=0)

    # Boolean mask over positions of best_word_set: true = keep.
    rem_indexes = trues(best_word_set)
    rem_indexes[remove_indexes] = false

    cur_word_set = best_word_set[rem_indexes]
    cur_end_point = get_end(LL, cur_word_set)

    # fitness is a negated distance, so this is the distance to the target
    # before the removal plus the distance after it.
    # NOTE(review): confirm that a sum (rather than a difference) is intended.
    length_removed = -fitness(target, get_end(LL,best_word_set)) -fitness(target, cur_end_point)

    @printval length_removed
    # Distance from the target of each column scaled by m_add, measured from the zero vector.
    LL_scores = -score_possible_additions(m_add*LL, target, get_end(LL, Int[]))

    valid_replacers = find(((m_add-1)*maximum(LL_scores) + LL_scores) .>= length_removed)
    valid_replacers
end



In [None]:
# NOTE(review): `best_wordset` is not defined in the visible notebook (other
# cells define `best_word_set` and `best_words`) -- confirm which one is meant.
make_longer_refinement2(data["LL"], target_sowe, best_wordset, [1:39;], 2)

In [None]:
data["indexed_words"][40482]

In [None]:
342/3

In [None]:
# Columns whose tripled vector lies within distance 120 of the target.
LLscores = -score_possible_additions(3*data["LL"], target_sowe, get_end(data["LL"], Int[]))
find(LLscores.<120)

In [None]:
# Greedy addition that also remembers the runner-up candidates of every step.
#
# Arguments
#   LL            - matrix whose columns are the candidate word vectors
#   target        - vector to approximate
#   best_word_set - starting word indexes (not modified)
#   keep_per_word - how many of the best-scoring candidates to remember per step
#   max_additions - upper bound on the number of additions (default: unbounded)
#
# Returns (word_set, score, good_ideas), where good_ideas is the Set of all
# column indexes that were among the `keep_per_word` best candidates at any step.
@everywhere function greedy_addition_with_backtrack_prep(LL::Matrix{Float64},
                         target::Vector{Float64},
                         best_word_set::Vector{Int},
                         keep_per_word = 50,
                         max_additions = Inf)

    end_point = get_end(LL, best_word_set)
    best_score = fitness(target, end_point)
    good_ideas = Set{Int}()
    sizehint!(good_ideas, 10*keep_per_word) #Most sentences in most corpora have at least 10 words
    did_improve = true
    cur_additions = 0
    while(did_improve && cur_additions<max_additions)

        cur_additions+=1
        did_improve=false

        addition_scores = score_possible_additions(LL, target, end_point)

        # Never ask for more candidates than there are columns.
        n_keep = min(keep_per_word, length(addition_scores))
        if n_keep < 1
            break
        end

        # Scores are "larger is fitter", so the order must be reversed: without
        # rev=true the selection returns the lowest-scoring (worst) candidates.
        high_scores = select!(collect(enumerate(addition_scores)), 1:n_keep,
                              by=ii_score->ii_score[2], rev=true)

        union!(good_ideas, map(ii_score->ii_score[1],high_scores))

        # First entry is the best-scoring candidate.
        addition,addition_score = high_scores[1]
        if addition_score>best_score+ϵ
            best_score=addition_score
            best_word_set = [best_word_set; addition]
            end_point += LL[:,addition]
            did_improve=true
        end
    end
    best_word_set,best_score,good_ideas
end

In [None]:
# Run the backtrack-prep variant from an empty word set.
best_word_set,best_score,good_ideas  = greedy_addition_with_backtrack_prep(data["LL"], target_sowe, Int[], )

In [None]:
max

In [None]:
#@pipe good_ideas |> map(println,_)
length(good_ideas)

In [None]:
# NOTE(review): `best_wordset` is not defined in the visible notebook; the cell
# above assigns `best_word_set` -- confirm which one is meant.
word_delete_refinement(data["LL"], target_sowe, best_wordset)

In [None]:
length(best_wordset)

In [None]:
a

In [None]:
# Try to improve a word set by replacing two of its words with one pair of words.
#
# Arguments
#   LL            - matrix whose columns are the word vectors
#   LL2           - matrix whose columns are sums of two word vectors (see get_LL2)
#   LL2_index     - row k holds the two word indexes that make up column k of LL2
#   target        - vector to approximate
#   best_word_set - current word indexes (not modified)
#
# For every pair of positions (ii < jj) both words are removed and the best
# column of LL2 is tried as a replacement; the candidate is adopted when it
# beats the best score so far by more than ϵ.
#
# Returns (word_set, score).
function double_word_swap_refinement(LL::Matrix{Float64},
                              LL2::Matrix{Float64}, LL2_index,
                              target::Vector{Float64},
                              best_word_set::Vector{Int})

    initial_word_set = copy(best_word_set)
    initial_end_point = get_end(LL, initial_word_set)
    # fitness takes (target, end_point); the end point was computed just above.
    best_score = fitness(target, initial_end_point)

    for ii in 1:length(initial_word_set)
        for jj in ii+1:length(initial_word_set)
            @inbounds end_point = initial_end_point - LL[:,initial_word_set[ii]] - LL[:,initial_word_set[jj]]
            addition_score, addition_index = findmax(score_possible_additions(LL2, target, end_point))
            if addition_score>best_score + ϵ
                # Everything except positions ii and jj, plus the two words of the winning pair.
                @inbounds word_subset = initial_word_set[[1:ii-1; ii+1:jj-1; jj+1:end]]
                @inbounds best_word_set = [word_subset; vec(LL2_index[addition_index,:])]
                best_score = addition_score
            end
        end
    end

    best_word_set,best_score

end

# Build the matrix of all pairwise column sums of `LL`.
#
# Returns (LL2, LL2_index): column k of LL2 is LL[:,ii] + LL[:,jj] for the pair
# (ii, jj) with jj <= ii that is stored in row k of LL2_index. A word paired
# with itself is included, giving (n^2 + n) ÷ 2 columns for n words.
# The number of columns is printed before LL2 is allocated.
function get_LL2(LL)
    n_words = size(LL,2)
    len_LL2 = (n_words^2 + n_words) ÷ 2
    LL2_index = Matrix{Int}(len_LL2, 2)
    @printval(len_LL2)
    LL2 = Matrix{Float64}(size(LL,1), len_LL2)
    kk = 0
    for ii in 1:n_words, jj in 1:ii
        kk += 1
        LL2_index[kk,1] = ii
        LL2_index[kk,2] = jj
        LL2[:,kk] = LL[:,ii] + LL[:,jj]
    end
    LL2, LL2_index
end


In [None]:
# get_LL2 returns the pair (LL2, LL2_index); unpack both so that LL2 is the
# matrix itself and LL2_index is available to double_word_swap_refinement.
LL2, LL2_index = get_LL2(data["LL"])

In [None]:
data["LL"]

In [None]:
# NOTE(review): presumably a size estimate for the pairwise matrix -- confirm the units.
819416403*50*2/ 1000_000_000

In [None]:
target_sowe = lookup_sowe(data,target_sent)
# NOTE(review): `LL` and `best_wordset` are not defined in the visible notebook -- confirm.
@time new_best_words, new_best_score =double_word_swap_refinement(LL, LL2, LL2_index, target_sowe, best_wordset)


In [None]:
target_sent = test_set[12]
println(target_sent)
target_sowe = lookup_sowe(data,target_sent)

In [None]:
# Search from ten noisy copies of the target vector and print the words found.
# NOTE(review): the greedy_search methods visible in this file accept only the
# keywords `rounds` and `log`; the positional arguments (1, 100) match none of
# them, so this call looks left over from an older signature.
for ii in 1:10
    sample_sowe = (randn!(similar(target_sowe)) + target_sowe)
    print(greedy_search(data, sample_sowe, 1,100, log=false)[1])
    println(",")
end

In [None]:
# NOTE(review): same positional-argument mismatch as above.
res = pmap([1:length(test_set);], test_set) do ii,target_sent
    sol, score = greedy_search(data, target_sent, 1, 100, log=false)
    (sol, score, ii)
end

In [None]:
# Serialize the found solutions together with the test set.
open("atis2_res_glove.jsz","w") do fh
    serialize(fh, (Vector{ASCIIString}[sol for (sol, score, ii) in res], test_set)
    )    
end



In [None]:
# NOTE(review): `atis_data` is not defined in the visible notebook -- confirm where it is loaded.
se = lookup_sowe(atis_data, ["flights","serve","lunch","which"])
se=se*0.6
greedy_search(atis_data, se, 1, 5, log=false)

In [None]:
# Results with a negative score, i.e. the target was not reproduced exactly.
hard_cases = find(x->x[2]<0, res)
res[hard_cases]

In [None]:
test_set[hard_cases]

In [None]:
hard_set =  test_set[hard_cases]

# NOTE(review): positional arguments (2, 5) match no visible greedy_search method.
hard_res = pmap([1:length(hard_set);], hard_set) do ii,target_sent
    sol, score = greedy_search(atis_data, target_sent, 2,5, log=false)
    (sol, score, ii)
end

In [None]:
very_hard_cases = find(x->x[2]<0.0, hard_res)

In [None]:
factorial(14)

In [None]:
hard_res[very_hard_cases]

In [None]:


# NOTE(review): with the `fitness` defined in this file scores are negated norms
# and therefore never positive, so `score>0` cannot hold -- confirm the intended test.
for (ii,target_sent) in enumerate(test_set)
    sol, score = greedy_search(atis_data, target_sent, log=false)
    if score>0
        print("$ii - ")
        println(join(target_sent, " "))
    end
end
    

In [None]:
enumerate(test_set) |> collect