In [1]:
using Iterators
using Pipe
using Compat

# @printval expr
#
# Print the source text of `expr`, followed by " = " and the value `expr`
# evaluates to in the caller's scope.
macro printval(ee)
    # `string` already yields the textual form; no formatting macro is needed.
    label = string(ee)
    # Escape only the user's expression so `println` is always the one visible
    # where this macro is defined, even if the caller shadows that name.
    :(println($label, " = ", $(esc(ee))))
end

# @pz expr
#
# Print the source text of `expr`, followed by the type and the size of the
# value it evaluates to in the caller's scope (tab separated).
macro pz(ee)
    label = string(ee)
    quote
        # Evaluate the user's expression exactly once; it may be expensive or
        # have side effects, and both `typeof` and `size` need its value.
        local val = $(esc(ee))
        println($label, "\t\t", typeof(val), "\t", size(val))
    end
end

# Add the working directory and ../util/ to the module search path so that
# modules stored there can be loaded with `using` / `import`.
push!(LOAD_PATH, ".")
push!(LOAD_PATH, "../util/")

4-element Array{ByteString,1}:
 "/home/ubuntu/build/julia-master/usr/local/share/julia/site/v0.5"
 "/home/ubuntu/build/julia-master/usr/share/julia/site/v0.5"      
 "."                                                              
 "../util/"                                                       

In [2]:
# Launch 11 additional worker processes; the `@everywhere` cells below run on them too.
addprocs(11)

11-element Array{Int64,1}:
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12

In [3]:
@everywhere using Distances



In [None]:

# Deserialize "doctors_letters.jsz" into `data` on every process, keep a handle
# on its "test_set" entry, and echo the loaded value as the cell result.
@everywhere data = open(deserialize, "doctors_letters.jsz", "r")
test_set = data["test_set"];
data

In [4]:
# Deserialize "atis_data2.jsz" into `data` on every process, keep a handle
# on its "test_set" entry, and echo the loaded value as the cell result.
@everywhere data = open(deserialize, "atis_data2.jsz", "r")
test_set = data["test_set"];
data

Dict{ASCIIString,Any} with 7 entries:
  "zeroed_words"  => ASCIIString[]
  "length_prob"   => [0.00273293,0.0141963,0.0332591,0.054354,0.0725396,0.08505…
  "LL"            => 300x1126 Array{Float64,2}:…
  "indexed_words" => AbstractString["the","of","to","and","in","a","for","that"…
  "test_set"      => [SubString{ASCIIString}["what","is","the","cheapest","way"…
  "trigrams"      => Any[("**START1**","**START2**","what"),("**START2**","what…
  "word_indexes"  => Dict{AbstractString,Int64}("carried"=>452,"diabetic"=>1069…

In [6]:
# Deserialize "brown_data.jsz" into `data` on every process, keep a handle
# on its "test_set" entry, and echo the loaded value as the cell result.
@everywhere data = open(deserialize, "brown_data.jsz", "r")
test_set = data["test_set"];
data

Dict{ASCIIString,Any} with 7 entries:
  "zeroed_words"  => ASCIIString[]
  "length_prob"   => [0.00631022,0.013634,0.0204009,0.0262119,0.0309737,0.03471…
  "LL"            => 300x40482 Array{Float64,2}:…
  "indexed_words" => AbstractString["the",",",".","of","to","and","in","a","for…
  "test_set"      => [ASCIIString["``","only","a","relative","handful","of","su…
  "trigrams"      => Any[("**START1**","**START2**","edison"),("**START2**","ed…
  "word_indexes"  => Dict{AbstractString,Int64}("baleful"=>34074,"confined"=>77…

In [None]:
# Sum of word embeddings for a sentence given as a single string: split it on
# whitespace and delegate to the method that takes a vector of words.
@everywhere function lookup_sowe(data, sent::AbstractString)
    tokens = split(sent)
    lookup_sowe(data, tokens)
end

# Sum of word embeddings for a tokenised sentence: look up each word's column
# index in data["word_indexes"], take that column of data["LL"], and add the
# columns together. Repeated words contribute once per occurrence.
@everywhere function lookup_sowe{S<:AbstractString}(data, sent::Vector{S})
    LL = data["LL"]
    word_indexes = data["word_indexes"]
    columns = [LL[:, word_indexes[word]] for word in sent]
    sum(columns)
end

# Translate a sequence of word indices into the corresponding entries of
# data["indexed_words"], preserving order.
@everywhere function lookup_words(data, path)
    vocabulary = data["indexed_words"]
    [vocabulary[word_index] for word_index in path]
end

In [None]:
# Improvement margin: the search routines below accept a candidate only when its
# score exceeds the current best score by more than ϵ.
@everywhere const ϵ = 10.0^-6

In [None]:
# Add up the columns of `LL` selected by `ws` (an index that appears several
# times is added once per occurrence).
#
# Returns a freshly allocated vector of length `size(LL, 1)`; it is all zeros
# when `ws` is empty.
@everywhere @inline function get_end(LL, ws::Vector{Int64})
    # Size the accumulator from LL's dimensions rather than from a copied column,
    # so no column is copied just to learn the shape and an empty `ws` works even
    # when LL has no columns.
    sofar = zeros(eltype(LL), size(LL, 1))
    for ii in ws
        # `ws` is supplied by callers, so bounds checking stays enabled here.
        for row in 1:size(LL, 1)
            sofar[row] += LL[row, ii]
        end
    end
    sofar
end

# Score every column of `LL` as a candidate next addition. Entry k of the result
# is minus the Euclidean distance between column k and the remaining gap
# `target - end_point`, so the largest entry marks the column closest to the gap.
@everywhere function score_possible_additions(LL, target, end_point)
    remaining = target - end_point
    # `pairwise` works on matrices; the double transpose reshapes the gap vector
    # into a one-column matrix on the Julia version this notebook targets.
    gap_column = remaining''
    distances = pairwise(Euclidean(), LL, gap_column)
    -distances
end

# Fitness of a candidate end point: the negated Euclidean distance to `target`,
# so a larger (less negative) value means a better fit.
@everywhere @inline fitness(target, end_point) = -euclidean(end_point, target)

# Greedily extend `best_word_set`: repeatedly append the column of `LL` whose
# addition brings the running sum closest to `target`, for as long as that
# improves the score by more than ϵ and fewer than `max_additions` rounds have
# been attempted.
#
# Returns `(word_set, score)`. The input vector is not mutated; a new vector is
# built whenever a word is appended.
@everywhere function greedy_addition(LL::Matrix{Float64},
                         target::Vector{Float64},
                         best_word_set::Vector{Int},
                         max_additions = Inf)
    end_point = get_end(LL, best_word_set)
    best_score = fitness(target, end_point)

    attempts = 0
    while attempts < max_additions
        attempts += 1

        candidate_scores = score_possible_additions(LL, target, end_point)
        top_score, top_word = findmax(candidate_scores)
        # Stop at the first round where even the best candidate fails to clear
        # the improvement margin.
        top_score > best_score + ϵ || break

        best_score = top_score
        best_word_set = vcat(best_word_set, top_word)
        end_point += LL[:, top_word]
    end
    best_word_set, best_score
end


# Try to improve a word set by single-word edits. For every position in the
# initial set, two candidates are scored:
#   1. the set with that word removed;
#   2. the set with that word removed and at most one word greedily added.
# A candidate replaces the current best only when it beats it by more than ϵ.
#
# Returns `(word_set, score)`. The input vector is not mutated.
@everywhere function word_swap_refinement(LL::Matrix{Float64},
                              target::Vector{Float64},
                              best_word_set::Vector{Int})

    initial_word_set = copy(best_word_set)
    end_point = get_end(LL, initial_word_set)
    best_score = fitness(target, end_point)

    for ii in 1:length(initial_word_set)
        word_set = initial_word_set[[1:ii-1; ii+1:end]]

        # Candidate 1: drop the word at position ii. `ii` is a position in the
        # word set, so the column to subtract is the word stored there, not
        # column `ii` of LL.
        sub_endpoint = end_point - LL[:, initial_word_set[ii]]
        subset_score = fitness(target, sub_endpoint)
        if subset_score > best_score + ϵ  # scores are negative
            best_score = subset_score
            best_word_set = word_set
        end

        # Candidate 2: drop the word, then try adding just one word greedily.
        add_word_set, add_score = greedy_addition(LL, target, word_set, 1)
        if add_score > best_score + ϵ
            best_score = add_score
            best_word_set = add_word_set
        end
    end

    best_word_set, best_score
end






# Try to improve a word set by replacing a pair of its words with a pair taken
# from a table of precomputed two-word sums.
#
# `LL2` holds the candidate pair sums as columns; row k of `LL2_index` lists the
# word indices that make up column k of `LL2`. For every pair of positions in
# the initial set, both words are removed and the best-scoring column of `LL2`
# is considered as their replacement; it is accepted only when it beats the
# current best score by more than ϵ.
#
# Returns `(word_set, score)`. The input vector is not mutated.
function double_word_swap_refinement(LL::Matrix{Float64},
                              LL2::Matrix{Float64}, LL2_index,
                              target::Vector{Float64},
                              best_word_set::Vector{Int})

    initial_word_set = copy(best_word_set)
    initial_end_point = get_end(LL, initial_word_set)
    # The baseline is the fitness of the unmodified set's end point; `fitness`
    # takes (target, end_point).
    best_score = fitness(target, initial_end_point)

    for ii in 1:length(initial_word_set)
        for jj in ii+1:length(initial_word_set)
            @inbounds end_point = initial_end_point - LL[:,initial_word_set[ii]] - LL[:,initial_word_set[jj]]
            addition_score, addition_index = findmax(score_possible_additions(LL2, target, end_point))
            if addition_score > best_score + ϵ
                @inbounds word_subset = initial_word_set[[1:ii-1; ii+1:jj-1; jj+1:end]]
                @inbounds best_word_set = [word_subset..., LL2_index[addition_index,:]...]
                best_score = addition_score
            end
        end
    end

    best_word_set, best_score
end

# Search for a bag of words whose embedding sum approximates `target`.
#
# Each round runs a greedy addition step followed by a single-word swap
# refinement step. The search stops when a step reaches a score >= 0.0, when a
# full round leaves the score unchanged, or after `rounds` rounds.
#
# Keyword arguments:
#   rounds - maximum number of add/swap rounds (default 1000)
#   log    - when true, print the score and words after each step
#
# Returns `(words, score, word_indexes)`: the words looked up in
# data["indexed_words"], the final score, and the indices into the columns of
# data["LL"].
@everywhere function greedy_search(data::Dict, target::Vector{Float64}; rounds=1000, log=false)
    get_words(iis) = [data["indexed_words"][ii] for ii in iis]

    word_iis = Int[]
    best_score = -Inf
    for round in 1:rounds
        word_iis, add_score = greedy_addition(data["LL"], target, word_iis)
        @assert add_score >= best_score
        log && println("POST_ADD_STEP: $add_score $(get_words(word_iis))")

        if add_score >= 0.0
            best_score = add_score
            break
        end

        word_iis, swap_score = word_swap_refinement(data["LL"], target, word_iis)
        @assert swap_score >= add_score
        log && println("POST_SWAP_STEP: $swap_score $(get_words(word_iis))")

        # Record the score before deciding to stop, so the returned score always
        # belongs to the returned word set.
        stalled = swap_score == best_score
        best_score = swap_score
        if stalled || swap_score >= 0.0
            break
        end
    end
    get_words(word_iis), best_score, word_iis
end



# Convenience method: search for a bag of words matching the embedding sum of
# a tokenised sentence. Keyword arguments are forwarded unchanged to the
# vector-target method.
@everywhere function greedy_search{S<:AbstractString}(data::Dict, target_sent::Vector{S}; kwargs...)
    target::Vector{Float64} = lookup_sowe(data,target_sent)
    # The `;` matters: without it the keyword pairs would be splatted as
    # positional arguments and no method would match.
    greedy_search(data, target; kwargs...)
end

In [None]:
# Scratch cell: number of entries in LL2_index.
# NOTE(review): LL2_index is not assigned in any cell visible here — presumably it
# comes from get_LL2; confirm.
length(LL2_index)

In [None]:

# Build the table of all unordered two-column sums of `LL` (including a column
# paired with itself).
#
# Returns `(LL2, LL2_index)`:
#   LL2       - matrix with size(LL, 1) rows and n*(n+1)/2 columns, n = size(LL, 2);
#               column k is LL[:, ii] + LL[:, jj]
#   LL2_index - matrix with one row per column of LL2; row k is [ii, jj] with jj <= ii
function get_LL2(LL)
    # Work on the argument itself; it must not be replaced by a global.
    n_words = size(LL, 2)
    len_LL2 = (n_words * (n_words + 1)) ÷ 2
    # Every slot is overwritten below, so plain zero-initialised buffers suffice.
    LL2_index = zeros(Int, len_LL2, 2)
    LL2 = zeros(Float64, size(LL, 1), len_LL2)
    kk = 0
    for ii in 1:n_words
        for jj in 1:ii
            kk += 1
            LL2_index[kk, 1] = ii
            LL2_index[kk, 2] = jj
            LL2[:, kk] = LL[:, ii] + LL[:, jj]
        end
    end
    LL2, LL2_index
end


In [None]:
# Pick one test sentence, print it, and run the greedy search with logging on.
# NOTE(review): the positional arguments `1,100` match neither greedy_search
# method visible in this notebook (both accept only `data` and a target
# positionally) — confirm the intended settings.
target_sent = test_set[664]
println(target_sent)
println("-------------")
best_words, best_score, best_wordset  = greedy_search(data, target_sent, 1,100, log=true)


In [None]:
# Time the pair-swap refinement, starting from the word set found by the search above.
# NOTE(review): `LL`, `LL2` and `LL2_index` are not assigned in any cell visible
# here — presumably `LL = data["LL"]` and `LL2, LL2_index = get_LL2(LL)`; confirm.
target_sowe = lookup_sowe(data,target_sent)
@time new_best_words, new_best_score =double_word_swap_refinement(LL, LL2, LL2_index, target_sowe, best_wordset)


In [None]:
# Select test sentence 12, print it, and compute its sum of word embeddings.
target_sent = test_set[12]
println(target_sent)
target_sowe = lookup_sowe(data,target_sent)

In [None]:
# Ten trials: add normally distributed noise to the target vector and print the
# words the greedy search returns for the perturbed target.
# NOTE(review): the positional arguments `1,100` match no greedy_search method
# visible in this notebook — confirm the intended settings.
for ii in 1:10
    sample_sowe = (randn!(similar(target_sowe)) + target_sowe)
    print(greedy_search(data, sample_sowe, 1,100, log=false)[1])
    println(",")
end

In [None]:
# Run the greedy search over every test sentence with pmap, collecting
# (solution words, score, sentence position) for each.
# NOTE(review): the positional arguments `1, 100` match no greedy_search method
# visible in this notebook — confirm the intended settings.
res = pmap([1:length(test_set);], test_set) do ii,target_sent
    sol, score = greedy_search(data, target_sent, 1, 100, log=false)
    (sol, score, ii)
end

In [None]:
# Serialize the recovered word lists, paired with the test set, to "atis2_res_glove.jsz".
open("atis2_res_glove.jsz","w") do fh
    serialize(fh, (Vector{ASCIIString}[sol for (sol, score, ii) in res], test_set)
    )    
end



In [None]:
# Experiment: scale a four-word sentence embedding by 0.6 and search for it.
# NOTE(review): `atis_data` is not assigned in any cell visible here, and the
# positional arguments `1, 5` match no visible greedy_search method — confirm.
se = lookup_sowe(atis_data, ["flights","serve","lunch","which"])
se=se*0.6
greedy_search(atis_data, se, 1, 5, log=false)

In [None]:
# Positions in `res` whose score (second tuple element) is negative, followed by those results.
hard_cases = find(x->x[2]<0, res)
res[hard_cases]

In [None]:
# Show the test sentences at the hard-case positions.
test_set[hard_cases]

In [None]:
# Re-run the search in parallel on only the hard sentences.
# NOTE(review): `atis_data` is not assigned in any cell visible here, and the
# positional arguments `2,5` match no visible greedy_search method — confirm.
hard_set =  test_set[hard_cases]

hard_res = pmap([1:length(hard_set);], hard_set) do ii,target_sent
    sol, score = greedy_search(atis_data, target_sent, 2,5, log=false)
    (sol, score, ii)
end

In [None]:
# Positions in `hard_res` whose score is still negative after the second pass.
very_hard_cases = find(x->x[2]<0.0, hard_res)

In [None]:
# Scratch computation: 14!.
factorial(14)

In [None]:
# Show the results that are still unsolved after the second pass.
hard_res[very_hard_cases]

In [None]:


# Serially search every test sentence and print the position and text of those
# whose score is positive.
# NOTE(review): `fitness` in this notebook returns a negated distance, which is
# never positive, so `score>0` looks like it can never hold — possibly `score<0`
# was intended; confirm. `atis_data` is also not assigned in any visible cell.
for (ii,target_sent) in enumerate(test_set)
    sol, score = greedy_search(atis_data, target_sent, log=false)
    if score>0
        print("$ii - ")
        println(join(target_sent, " "))
    end
end
    

In [None]:
# Materialise (position, sentence) pairs for inspection.
enumerate(test_set) |> collect

In [None]:
# Shell out to git to push the repository.
`git push""` |> run

In [None]:
# Shell out to git to commit all tracked changes.
# NOTE(review): `-m="..."` passes the `=` along with the message text — verify the
# resulting commit message is what was intended.
`git commit -m="Removed ants " -a` |> run