In [1]:
using FunctionalCollections
using Iterators
using Pipe
using Compat

# Print an expression's source text followed by its value, e.g. `x = 3`.
# The text of `ee` is captured when the macro is expanded; the generated
# `println` call is escaped as a whole, so `ee` is evaluated in the caller's scope.
macro printval(ee)
    label = string(ee)
    esc(Expr(:call, :println, label, " = ", ee))
end

# Print an expression's source text, then the type and the size of its value,
# separated by tabs. As with `printval`, the generated call is escaped as a
# whole so `ee` is evaluated in the caller's scope.
macro pz(ee)
    label = string(ee)
    type_call = Expr(:call, :typeof, ee)
    size_call = Expr(:call, :size, ee)
    esc(Expr(:call, :println, label, "\t\t", type_call, "\t", size_call))
end

# Make modules in the notebook's own directory and in ../util/ loadable.
push!(LOAD_PATH, ".")
push!(LOAD_PATH, "../util/")

4-element Array{ByteString,1}:
 "/home/ubuntu/build/julia-master/usr/local/share/julia/site/v0.5"
 "/home/ubuntu/build/julia-master/usr/share/julia/site/v0.5"      
 "."                                                              
 "../util/"                                                       

In [2]:
# Start 11 worker processes; the `@everywhere` definitions and `pmap` calls in
# later cells run on them.
addprocs(11)

11-element Array{Int64,1}:
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12

In [3]:
# Load the data file on every process, so that each one has its own global `data`.
@everywhere using JLD
@everywhere data = load("brown_glove300.jld")
# The sentences to reconstruct: each entry is a vector of word strings.
test_set = data["corpus"]



42004-element Array{Array{ASCIIString,1},1}:
 ASCIIString["``","only","a","relative","handful","of","such","reports","was","received"  …  "of","voters","and","the","size","of","this","city","''","."]                       
 ASCIIString["it","recommended","that","fulton","legislators","act","``","to","have","these"  …  "to","the","end","of","modernizing","and","improving","them","''","."]          
 ASCIIString["the","grand","jury","commented","on","a","number","of","other","topics"  …  "inure","to","the","best","interest","of","both","governments","''","."]               
 ASCIIString["merger","proposed"]                                                                                                                                                
 ASCIIString["however",",","the","jury","said","it","believes","``","these","two"  …  "greater","efficiency","and","reduce","the","cost","of","administration","''","."]         
 ASCIIString["the","city","purchasing","department",",","the","ju

In [4]:
@everywhere using Distances



In [5]:
# Sum of word embeddings for a sentence given as one string: the string is
# split with `split` and the pieces are handed to the vector-of-words method.
@everywhere function lookup_sowe(data, sent::AbstractString)
    words = split(sent)
    lookup_sowe(data, words)
end

# Sum of word embeddings for a sentence given as a vector of words.
#
# `data["word_indexes"]` maps each word to a column of `data["LL"]`; the result
# is the sum of those columns, one per word (repeated words count repeatedly).
# An empty sentence yields a zero vector with one entry per row of `data["LL"]`,
# matching what `get_end` returns for an empty word set.
# A word missing from `data["word_indexes"]` raises the lookup's own error.
@everywhere function lookup_sowe{S<:AbstractString}(data, sent::Vector{S})
    LL = data["LL"]
    word_indexes = data["word_indexes"]
    # Start from zeros so that an empty sentence is well defined.
    total = zeros(eltype(LL), size(LL, 1))
    for word in sent
        total += LL[:, word_indexes[word]]
    end
    total
end

# Translate a sequence of word indexes into the corresponding words, using the
# index-to-word table stored under `data["indexed_words"]`.
@everywhere function lookup_words(data, path)
    word_at(idx) = data["indexed_words"][idx]
    [word_at(idx) for idx in path]
end

In [6]:
# Tolerance used by the search routines: a candidate only counts as an
# improvement when its score exceeds the current best by more than ϵ.
@everywhere const ϵ = 10.0^-6

In [7]:
# End point reached by a word set: the sum of the columns of `LL` selected by
# the indexes in `ws`. An empty `ws` gives a zero vector with one entry per
# row of `LL`.
@everywhere @inline function get_end(LL, ws::AbstractVector{Int})
    isempty(ws) && return zeros(size(LL,1))
    @inbounds total = sum([sub(LL,(:,jj)) for jj in ws])
    total
end

# Score every column of `LL` as a candidate addition to the current end point.
# The score of a column is the negated Euclidean distance between that column
# and the remaining gap `target - end_point`, so larger (closer to zero) is better.
# Returns a one-column matrix with one row per column of `LL`.
@everywhere function score_possible_additions(LL, target, end_point)
    gap = target - end_point
    # `pairwise` wants matrices: present the gap as a single-column matrix.
    gap_column = reshape(gap, length(gap), 1)
    -pairwise(Euclidean(), LL, gap_column)
end

# Fitness of an end point with respect to the target: the negated Euclidean
# distance between them. Fitter is larger; a perfect match scores zero.
@everywhere @inline fitness(target, end_point) = -euclidean(end_point, target)

# Greedily extend a word set: repeatedly append the single word (column of
# `LL`) that brings the summed end point closest to `target`, for as long as
# each step improves the score by more than ϵ.
#
# Arguments:
#   LL               - matrix with one column per word
#   target           - vector the summed columns should approach
#   initial_word_set - word indexes to start from (never modified)
#   max_additions    - upper bound on the number of words appended
# Returns `(word_set, score)` where `score` is the fitness (negated Euclidean
# distance to `target`) of the returned set.
@everywhere function greedy_addition(LL::Matrix{Float64},
                         target::Vector{Float64},
                         initial_word_set::AbstractVector{Int},
                         max_additions = Inf)
    # Always work on a fresh vector: `convert(Vector{Int}, x)` hands back `x`
    # itself when it already is a Vector{Int}, and `push!` below would then
    # modify the caller's array.
    best_word_set = Int[ii for ii in initial_word_set]
    end_point = get_end(LL, best_word_set)
    best_score = fitness(target, end_point)

    cur_additions = 0
    while cur_additions < max_additions
        cur_additions += 1
        addition_scores = score_possible_additions(LL, target, end_point)
        addition_score, addition = findmax(addition_scores)
        # Stop as soon as the best candidate no longer gives a real improvement.
        addition_score > best_score + ϵ || break

        best_score = addition_score
        push!(best_word_set, addition)
        end_point += sub(LL,(:,addition))
    end
    best_word_set,best_score
end


# One-word swap refinement: for every word of the set except the last, try
# (a) deleting it and (b) deleting it and greedily adding one replacement, and
# keep the best-scoring variant found.
#
# Arguments:
#   LL               - matrix with one column per word
#   target           - vector the summed columns should approach
#   initial_word_set - word indexes (columns of LL) to refine; not modified
# Returns `(word_set, score)`; if nothing beats the initial set by more than ϵ
# a copy of the initial set and its score are returned.
@everywhere function word_swap_refinement(LL::Matrix{Float64},
                              target::Vector{Float64},
                              initial_word_set::AbstractVector{Int})

    best_word_set = copy(initial_word_set)
    end_point = get_end(LL, best_word_set)
    best_score = fitness(target, end_point)

    n_words_initial = length(initial_word_set)
    for ii in 1:n_words_initial-1 #Don't need to consider last word added as it was added greedily
        word_set = initial_word_set[[1:ii-1; ii+1:n_words_initial]]
        # `ii` is a position in the word set; the column to take away is the
        # one belonging to the word stored at that position.
        sub_endpoint = end_point - LL[:, initial_word_set[ii]]
        subset_score = fitness(target, sub_endpoint)
        if subset_score > best_score + ϵ #scores are negative
            best_score = subset_score
            best_word_set = word_set
        end

        # Try adding just one word greedily. A copy is passed so that
        # `word_set` (possibly the current best) cannot be altered by the callee.
        add_word_set, add_score = greedy_addition(LL, target, copy(word_set), 1)
        if add_score > best_score + ϵ
            best_score = add_score
            best_word_set = add_word_set
        end
    end

    best_word_set,best_score
end


# Search for a bag of words whose summed embedding matches `target`.
#
# Each round runs a greedy addition pass followed by a one-word swap
# refinement. The search stops when a score of zero (exact match) is reached,
# when a round's swap score is within ϵ of the previous round's score, or
# after `rounds` rounds.
#
# Arguments:
#   data   - Dict providing "LL" (one column per word) and "indexed_words"
#   target - vector to reconstruct
# Keywords:
#   rounds - maximum number of add/swap rounds
#   log    - when true, print the word set and score after every step
# Returns `(words, score)`; `score` is always the score of the returned words
# (or -Inf with no words when `rounds` is less than 1).
@everywhere function greedy_search(data::Dict, target::Vector{Float64}; rounds=1000, log=false)
    get_words(word_iis) = ASCIIString[data["indexed_words"][ii] for ii in word_iis]

    word_iis = Int[]
    best_score = -Inf
    for round_number in 1:rounds
        word_iis, add_score = greedy_addition(data["LL"], target, word_iis)
        log && println("POST_ADD_STEP: $add_score $(get_words(word_iis))")
        @assert add_score + ϵ >= best_score

        if add_score >= 0.0
            best_score = add_score
            break
        end

        word_iis, swap_score = word_swap_refinement(data["LL"], target, word_iis)
        log && println("POST_SWAP_STEP: $swap_score $(get_words(word_iis))")
        @assert swap_score + ϵ >= add_score

        # Decide on convergence against the previous round's score, then record
        # the new score before leaving: `word_iis` already holds the swapped
        # set, so the score reported with it must be `swap_score`.
        converged = best_score - ϵ < swap_score < best_score + ϵ
        best_score = swap_score
        if converged || swap_score >= 0.0
            break
        end
    end
    get_words(word_iis),best_score
end



# Convenience method: take the target as a sentence (vector of words), turn it
# into its sum of word embeddings and run the vector-target search on that.
# Keyword arguments are forwarded unchanged.
@everywhere function greedy_search{S<:AbstractString}(data::Dict, target_sent::Vector{S}; kwargs...)
    sowe = convert(Vector{Float64}, lookup_sowe(data,target_sent))::Vector{Float64}
    greedy_search(data, sowe; kwargs...)
end

In [None]:
# Run the search over every sentence of the test set in parallel.
# `data` here is the global that the earlier `@everywhere data = load(...)` cell
# defined on each process. Each result is (original sentence, found words, score).
res = pmap(test_set) do target_sent
    sol, score = greedy_search(data, target_sent, rounds=10_000, log=false)
    (target_sent, sol, score)
end

In [None]:
# Persist the results of the parallel run to a JLD file.
using JLD
@save "brown_glove300_res.jld" res

In [None]:
###############
#experimental
###############

In [None]:
# Pick one test sentence, compute its target vector and run the search on it
# with per-step logging switched on.
target_sent = test_set[664+6]
target_sowe = lookup_sowe(data,target_sent)
println(target_sent)
println("-------------")
best_words, best_score  = greedy_search(data, target_sent, log=true)


In [None]:
# Shorter/Longer Refinement: replacing n words with m other words.
# Either the words in the current set are too short, in which case the search
# can be limited to words that are longer than the sum of the shortest ones,
# or they are too long, in which case the reverse applies.
#
# Here something is removed, with the plan of getting closer to the target
# again by adding a finite number (`m_add`) of further elements.
#
# Arguments:
#   LL             - matrix with one column per word
#   target         - vector the summed columns should approach
#   best_word_set  - current word indexes (columns of LL)
#   remove_indexes - positions within `best_word_set` that are dropped
#   m_add          - number of words planned to be added back (must be >= 0)
# Returns the indexes of the columns of `LL` that pass the length test below.
# Prints `length_removed` as a side effect.
function make_longer_refinement2(LL::Matrix{Float64},
                              target::Vector{Float64},
                              best_word_set::Vector{Int},
                              remove_indexes::Vector{Int},
                              m_add::Int)
    @assert(m_add>=0)

    # Mask of the positions that stay in the word set.
    rem_indexes = trues(best_word_set)
    rem_indexes[remove_indexes] = false

    cur_word_set = best_word_set[rem_indexes]
    cur_end_point = get_end(LL, cur_word_set)

    # `-fitness` is a Euclidean distance, so this is the distance of the full
    # set from the target PLUS the distance of the reduced set from the target.
    # NOTE(review): confirm that a sum, not a difference, is intended here.
    length_removed = -fitness(target, get_end(LL,best_word_set)) -fitness(target, cur_end_point)

    @printval length_removed
    # Distance of every column of `m_add*LL` from the target itself (the end
    # point of an empty word set is the zero vector).
    LL_scores = -score_possible_additions(m_add*LL, target, get_end(LL, Int[]))

    find((m_add-1)*maximum(LL_scores) + LL_scores.>=length_removed)
end



In [None]:
# Try the refinement with the first 39 positions removed and two words to add.
# NOTE(review): `best_wordset` is not defined in any visible cell (the visible
# names are `best_words` and `best_word_set`) — confirm which one is meant.
make_longer_refinement2(data["LL"], target_sowe, best_wordset, [1:39;], 2)

In [None]:
# Scratch: show the word stored at index 40482.
data["indexed_words"][40482]

In [None]:
# Scratch arithmetic.
342/3

In [None]:
# Scratch: distance of every column of 3*LL from the target (the end point of
# an empty word set is the zero vector), then the columns closer than 120.
LLscores = -score_possible_additions(3*data["LL"], target_sowe, get_end(data["LL"], Int[]))
find(LLscores.<120)

In [None]:
# Greedy addition that also remembers runner-up candidates.
#
# Works like `greedy_addition`, but at every step the `keep_per_word`
# best-scoring candidate words are collected into a set of "good ideas" for a
# later backtracking stage.
#
# Arguments:
#   LL            - matrix with one column per word
#   target        - vector the summed columns should approach
#   best_word_set - word indexes to start from (not modified)
#   keep_per_word - number of top candidates remembered per step
#   max_additions - upper bound on the number of words appended
# Returns `(word_set, score, good_ideas)`.
@everywhere function greedy_addition_with_backtrack_prep(LL::Matrix{Float64},
                         target::Vector{Float64},
                         best_word_set::Vector{Int},
                         keep_per_word = 50,
                         max_additions = Inf)

    end_point = get_end(LL, best_word_set)
    best_score = fitness(target, end_point)
    # Never ask for more candidates than there are words.
    n_keep = min(keep_per_word, size(LL, 2))
    good_ideas = Set{Int}()
    sizehint!(good_ideas, 10*keep_per_word) #Most sentences in most corpora have at least 10 words
    did_improve = true
    cur_additions = 0
    while did_improve && cur_additions < max_additions
        cur_additions += 1
        did_improve = false
        n_keep >= 1 || break # nothing to choose from

        addition_scores = score_possible_additions(LL, target, end_point)

        # Scores are negated distances (fitter is larger), so the candidates
        # wanted are the LARGEST scores: select in reverse order.
        high_scores = select!(collect(enumerate(addition_scores)), 1:n_keep,
                              by=ii_score->ii_score[2], rev=true)

        union!(good_ideas, map(ii_score->ii_score[1], high_scores))

        addition, addition_score = high_scores[1]
        if addition_score > best_score + ϵ
            best_score = addition_score
            best_word_set = vcat(best_word_set, addition)
            end_point += LL[:,addition]
            did_improve = true
        end
    end
    best_word_set,best_score,good_ideas
end

In [None]:
# Run the backtrack-preparing greedy addition from an empty word set, with the
# default `keep_per_word` and `max_additions`.
best_word_set,best_score,good_ideas  = greedy_addition_with_backtrack_prep(data["LL"], target_sowe, Int[], )

In [None]:
# Scratch cell: evaluates the bare name `max`.
max

In [None]:
# Number of distinct candidate words collected (printing each one is disabled).
#@pipe good_ideas |> map(println,_)
length(good_ideas)

In [None]:
# Deletion refinement: look for a subset of the word set, obtained by deleting
# words, whose summed end point is closer to `target` than the full set.
#
# Deletions are explored recursively. A branch is only expanded while at least
# `min_words` words remain, and positions are deleted in increasing order so
# that every subset is visited once. Words are tracked by position, so a word
# occurring several times in the set can be deleted once and kept otherwise.
# NOTE: the number of subsets grows combinatorially with the set size.
#
# Arguments:
#   LL            - matrix with one column per word
#   target        - vector the summed columns should approach
#   best_word_set - word indexes (columns of LL) to refine; not modified
# Keywords:
#   min_words     - stop deleting once fewer than this many words remain
# Returns `(word_set, score)`; the input set (as a new vector) and its score
# when no deletion improves on it by more than ϵ.
function word_delete_refinement(LL::Matrix{Float64},
                              target::Vector{Float64},
                              best_word_set::Vector{Int};
                              min_words = 35)

    n_words = length(best_word_set)
    initial_end_point = get_end(LL, best_word_set)
    best_score = fitness(target, initial_end_point)

    kept = trues(n_words)      # positions currently still in the set
    best_kept = copy(kept)     # positions of the best subset seen so far

    function inner(first_pos, n_remaining, endpoint)
        if n_remaining < min_words
            return
        end

        for pos in first_pos:n_words
            new_endpoint = endpoint - LL[:,best_word_set[pos]]
            del_score = fitness(target, new_endpoint)

            kept[pos] = false
            if del_score > best_score + ϵ
                best_score = del_score
                best_kept = copy(kept)
            end

            # Continue with this word actually removed and the end point
            # moved accordingly, then restore it for the next candidate.
            inner(pos+1, n_remaining-1, new_endpoint)
            kept[pos] = true
        end
    end

    inner(1, n_words, initial_end_point)

    best_word_set[best_kept],best_score
end

In [None]:
# Try the deletion refinement on the current word set.
# NOTE(review): `best_wordset` is not defined in any visible cell — confirm
# whether `best_word_set` is meant.
word_delete_refinement(data["LL"], target_sowe, best_wordset)

In [None]:
# Scratch: size of the current word set (`best_wordset` is not defined in any
# visible cell).
length(best_wordset)

In [None]:
# Scratch cell: evaluates the bare name `a`; no definition of `a` is visible
# in this file.
a

In [None]:
# Two-for-two swap refinement: for every pair of positions in the word set,
# remove both words and look up the best replacement PAIR in the precomputed
# pair table, keeping the best-scoring result.
#
# Arguments:
#   LL            - matrix with one column per word
#   LL2           - matrix with one column per word pair (sum of two columns
#                   of LL), as built by `get_LL2`
#   LL2_index     - row kk holds the two word indexes behind column kk of LL2
#   target        - vector the summed columns should approach
#   best_word_set - word indexes (columns of LL) to refine; not modified
# Returns `(word_set, score)`; the input set and its score when no swap
# improves on it by more than ϵ.
function double_word_swap_refinement(LL::Matrix{Float64},
                              LL2::Matrix{Float64}, LL2_index,
                              target::Vector{Float64},
                              best_word_set::Vector{Int})

    initial_word_set = copy(best_word_set)
    initial_end_point = get_end(LL, initial_word_set)
    # `fitness` takes (target, end_point); score the starting set through its
    # end point.
    best_score = fitness(target, initial_end_point)

    n_words = length(initial_word_set)
    for ii in 1:n_words
        for jj in ii+1:n_words
            end_point = initial_end_point - LL[:,initial_word_set[ii]] - LL[:,initial_word_set[jj]]
            addition_score, addition_index = findmax(score_possible_additions(LL2, target, end_point))
            if addition_score > best_score + ϵ
                word_subset = initial_word_set[[1:ii-1; ii+1:jj-1; jj+1:n_words]]
                best_word_set = vcat(word_subset, vec(LL2_index[addition_index,:]))
                best_score = addition_score
            end
        end
    end

    best_word_set,best_score
end

# Build the table of all pairwise column sums of `LL`: one column for every
# unordered pair of columns, a column paired with itself included.
# Returns `(LL2, LL2_index)` where `LL2[:,kk] == LL[:,ii] + LL[:,jj]` and
# `LL2_index[kk,:]` holds `ii` and `jj` (with `jj <= ii`).
# Prints the number of pair columns as a side effect.
function get_LL2(LL)
    n_cols = size(LL,2)
    n_rows = size(LL,1)
    len_LL2 = (n_cols*(n_cols + 1)) ÷ 2
    LL2_index = Matrix{Int}(len_LL2, 2)
    @printval(len_LL2)
    LL2 = Matrix{Float64}(n_rows, len_LL2)
    pair = 0
    for ii in 1:n_cols, jj in 1:ii
        pair += 1
        LL2_index[pair,1] = ii
        LL2_index[pair,2] = jj
        for rr in 1:n_rows
            LL2[rr,pair] = LL[rr,ii] + LL[rr,jj]
        end
    end
    LL2, LL2_index
end


In [None]:
# `get_LL2` returns the pair (LL2, LL2_index); unpack both so that the pair
# matrix and its index table can be passed on separately.
LL2, LL2_index = get_LL2(data["LL"])

In [None]:
# Scratch: display the embedding matrix.
data["LL"]

In [None]:
# Scratch arithmetic. 819416403 equals 40482*40483/2 — presumably the number
# of pair columns `get_LL2` would create for a 40482-word vocabulary; the
# meaning of the remaining factors is not visible here (TODO confirm).
819416403*50*2/ 1000_000_000

In [None]:
# Time the two-for-two swap refinement on the current sentence.
# NOTE(review): `LL` and `best_wordset` are not defined in any visible cell
# (visible candidates: `data["LL"]`, `best_word_set`) — confirm.
target_sowe = lookup_sowe(data,target_sent)
@time new_best_words, new_best_score =double_word_swap_refinement(LL, LL2, LL2_index, target_sowe, best_wordset)


In [None]:
# Switch the experiment to test sentence 12 and recompute its target vector.
target_sent = test_set[12]
println(target_sent)
target_sowe = lookup_sowe(data,target_sent)

In [None]:
# Ten trials: add standard normal noise to the target vector and print the
# words the search recovers from the perturbed target.
# NOTE(review): the `greedy_search` methods visible in this file accept only
# `(data, target; rounds, log)`; the extra positional arguments `1,100` match
# none of them — presumably written against an earlier signature.
for ii in 1:10
    sample_sowe = (randn!(similar(target_sowe)) + target_sowe)
    print(greedy_search(data, sample_sowe, 1,100, log=false)[1])
    println(",")
end

In [None]:
# Parallel run over the test set that also records each sentence's position.
# Each result is (found words, score, index).
# NOTE(review): the positional arguments `1, 100` do not match the
# `greedy_search` signatures visible in this file — confirm.
res = pmap([1:length(test_set);], test_set) do ii,target_sent
    sol, score = greedy_search(data, target_sent, 1, 100, log=false)
    (sol, score, ii)
end

In [None]:
# Serialize the found word lists together with the test set to a file.
# Expects `res` to hold (sol, score, ii) tuples.
open("atis2_res_glove.jsz","w") do fh
    serialize(fh, (Vector{ASCIIString}[sol for (sol, score, ii) in res], test_set)
    )    
end



In [None]:
# Experiment: search from a target that is the four-word sum scaled by 0.6.
# NOTE(review): `atis_data` is not defined in any visible cell, and the
# positional arguments `1, 5` do not match the visible `greedy_search`
# signatures — confirm.
se = lookup_sowe(atis_data, ["flights","serve","lunch","which"])
se=se*0.6
greedy_search(atis_data, se, 1, 5, log=false)

In [None]:
# Positions of the results whose second tuple element is negative, and those
# results themselves. Assumes `res` holds (sol, score, ii) tuples, so that the
# second element is the score — TODO confirm which `res` is in scope.
hard_cases = find(x->x[2]<0, res)
res[hard_cases]

In [None]:
# Show the test sentences at the hard-case positions.
test_set[hard_cases]

In [None]:
# Re-run the search on the hard cases only.
# Each result is (found words, score, index within `hard_set`).
# NOTE(review): `atis_data` is not defined in any visible cell, and the
# positional arguments `2,5` do not match the visible `greedy_search`
# signatures — confirm.
hard_set =  test_set[hard_cases]

hard_res = pmap([1:length(hard_set);], hard_set) do ii,target_sent
    sol, score = greedy_search(atis_data, target_sent, 2,5, log=false)
    (sol, score, ii)
end

In [None]:
# Positions within `hard_res` whose score is still negative after the re-run.
very_hard_cases = find(x->x[2]<0.0, hard_res)

In [None]:
# Scratch arithmetic.
factorial(14)

In [None]:
# Show the results that remain unsolved after the re-run.
hard_res[very_hard_cases]

In [None]:


# Serial pass over the test set, printing index and text of every sentence
# whose score is positive.
# NOTE(review): with the `fitness` defined in this file (a negated Euclidean
# distance) scores are never positive — confirm whether `score<0` was meant.
# `atis_data` is not defined in any visible cell.
for (ii,target_sent) in enumerate(test_set)
    sol, score = greedy_search(atis_data, target_sent, log=false)
    if score>0
        print("$ii - ")
        println(join(target_sent, " "))
    end
end
    

In [None]:
# Scratch: materialize the (index, sentence) pairs of the test set.
enumerate(test_set) |> collect