In [1]:
using FunctionalCollections
using Iterators
using Pipe
using Compat

# Debugging aid: `@printval expr` prints the source text of `expr`, then " = ",
# then the value `expr` evaluates to in the caller's scope.
@everywhere macro printval(ee)
    label = string(ee)
    esc(Expr(:call, :println, label, " = ", ee))
end

# Debugging aid: `@pz expr` prints the source text of `expr` followed by the
# type and the size of its value, separated by tabs.
macro pz(ee)
    label = string(ee)
    type_call = Expr(:call, :typeof, ee)
    size_call = Expr(:call, :size, ee)
    esc(Expr(:call, :println, label, "\t\t", type_call, "\t", size_call))
end

# Make modules in the working directory and in ../util/ loadable by name.
push!(LOAD_PATH, ".")
push!(LOAD_PATH, "../util/")

4-element Array{ByteString,1}:
 "/home/ubuntu/build/julia-master/usr/local/share/julia/site/v0.5"
 "/home/ubuntu/build/julia-master/usr/share/julia/site/v0.5"      
 "."                                                              
 "../util/"                                                       

In [2]:
# Start 11 worker processes (ids 2-12 in the output below). Anything defined with
# @everywhere before this line ran while only the master process existed.
addprocs(11)

11-element Array{Int64,1}:
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12

In [3]:
# Load the "data_sym" entry of the books JLD file on every process, so each
# worker has its own `data` global available to the pmap bodies further down.
@everywhere using JLD
@everywhere data = load("results/data/books/books300d_sym.jld","data_sym")
# The test corpus is deserialized on the master process only; the echoed value
# is a vector of sentences, each a Vector{Symbol}.
corpus_filename = "test_books_corpus_0.01_of_test"
test_set = open(deserialize, "results/data/books/"*corpus_filename*".jsz")



66464-element Array{Array{Symbol,1},1}:
 [:okay,symbol(","),:i,:said,symbol(","),:this,:is,:going,:to,:be,:some,:fun,:revenge,:.]                                                                               
 [:this,:is,:just,:great,symbol(","),:seth,:shouted,:.]                                                                                                                 
 [:start,:in,:the,:beginning,:and,:do,symbol("n't"),:hold,:anything,:back,:.]                                                                                           
 [:a,:lot,:of,:explanations,:.]                                                                                                                                         
 [:even,:if,:she,:really,:loved,:him,symbol(","),:noah,:would,:never  …  :with,:her,:past,:and,:his,:?,:his,:first,:kiss,:?]                                            
 [:a,:dilemma,:that,:never,:would,:have,:bothered,:her,:before,:.]                                                 

In [4]:
#@everywhere using Distances

In [5]:
# Convenience method: split `sent` on whitespace and delegate to the
# vector-of-words method of `lookup_sowe`.
@everywhere function lookup_sowe(data, sent)
    tokens = split(sent)
    lookup_sowe(data, tokens)
end

# Sum of word embeddings: adds up the columns of data["LL"] selected through
# data["word_indexes"] for every word in `sent` (repeated words count each time).
@everywhere function lookup_sowe{S}(data, sent::Vector{S})
    column_for(word) = data["LL"][:, data["word_indexes"][word]]
    columns = [column_for(word) for word in sent]
    sum(columns)
end

# Map each index in `path` to its word via data["indexed_words"], returning the
# words as an ASCIIString vector.
@everywhere function lookup_words(data, path)
    word_at(ii) = data["indexed_words"][ii]
    ASCIIString[word_at(ii) for ii in path]
end

# Map each word in `sent` to its integer index via data["word_indexes"].
@everywhere function lookup_indexes{S}(data, sent::Vector{S})
    index_of(word) = data["word_indexes"][word]
    Int[index_of(word) for word in sent]
end

In [6]:
# Improvement tolerance: the search routines in this notebook only accept a
# candidate whose score exceeds the current best by more than ϵ.
@everywhere const ϵ = 10.0^-5

In [7]:
# Return the sum of the columns of `LL` selected by the indices in `ws`
# (repeated indices are added repeatedly).
#
# For an empty `ws` the result is a zero vector with one entry per row of `LL`
# and the same element type as `LL`. Using a plain `zeros(n)` here would give
# Float64 even for a Float32 matrix, making the return type depend on whether
# `ws` is empty and silently promoting every later computation to Float64.
@everywhere @inline function get_end(LL, ws::AbstractVector{Int})
    if length(ws) == 0
        return zeros(eltype(LL), size(LL,1))
    end
    @inbounds sofar = sum([sub(LL,(:,ii)) for ii in ws])
    sofar
end

# Score every column of `LL` as a candidate addition: for each column, the
# negated Euclidean distance between (end_point + column) and `target`.
# Returns a 1 x size(LL,2) row; larger (closer to zero) is better.
# (A city-block variant would use sumabs instead of sqrt(sumabs2).)
@everywhere function score_possible_additions(LL, target, end_point)
    offset = end_point .- target
    squared_distances = sumabs2(LL .+ offset, 1)
    -sqrt(squared_distances)
end

# Fitness of an end point: the negated Euclidean distance to `target`.
# Larger is fitter; 0 means the end point coincides with the target.
# (A city-block variant would use -sumabs of the residual instead.)
@everywhere  @inline function fitness(target, end_point)
    residual = end_point .- target
    -norm(residual)
end

# Greedily grow a word set so that the sum of its columns in `LL` approaches `target`.
#
# Starting from `initial_word_set`, repeatedly appends the column index whose
# addition gives the best score, as long as that score beats the current best
# by more than ϵ, and for at most `max_additions` additions.
#
# Returns (word_set, score): a freshly allocated Vector{Int} and the fitness of
# its end point. `initial_word_set` itself is never modified.
@everywhere function greedy_addition{F<:AbstractFloat}(LL::Matrix{F},
                         target::Vector{F},
                         initial_word_set::AbstractVector{Int},
                         max_additions = Inf)
    # collect always allocates a new Vector{Int}. convert(Vector{Int}, v) would
    # return `v` itself when it already is a Vector{Int}, and the push! below
    # would then silently mutate the caller's vector.
    best_word_set = collect(initial_word_set)
    end_point = get_end(LL, best_word_set)
    best_score = fitness(target, end_point)
    
    cur_additions = 0
    while(cur_additions<max_additions)   
        cur_additions+=1
        addition_scores = score_possible_additions(LL, target, end_point)
        addition_score, addition = findmax(addition_scores)
        if addition_score>best_score+ϵ
            best_score=addition_score
            push!(best_word_set, addition)
            end_point += sub(LL,(:,addition))
        else 
            # No single word improves the score any further.
            break
        end
    end
    best_word_set,best_score
end


# One pass of swap refinement over `initial_word_set`.
#
# For every position except the last (the last word was just added greedily),
# two candidates are scored against `target`: the set with that word removed,
# and that reduced set with one greedily chosen word added back. A candidate
# replaces the running best only when it beats it by more than ϵ.
#
# Returns (word_set, score). The returned set is a copy of the input when
# nothing improved, otherwise the winning candidate.
@everywhere function word_swap_refinement{F<:AbstractFloat}(LL::Matrix{F},
                              target::Vector{F},
                              initial_word_set::AbstractVector{Int})
    full_end_point = get_end(LL, initial_word_set)
    top_set = copy(initial_word_set)
    top_score = fitness(target, full_end_point)   # scores are negative

    n_words = length(initial_word_set)
    for drop in 1:n_words-1
        kept = sub(initial_word_set,[1:drop-1; drop+1:n_words])

        # Candidate 1: simply drop the word at position `drop`.
        reduced_end_point = full_end_point - sub(LL,(:,initial_word_set[drop]))
        drop_score = fitness(target, reduced_end_point)
        if drop_score>top_score+ϵ
            top_score = drop_score
            top_set = kept
        end

        # Candidate 2: drop it and greedily add exactly one word in its place.
        swapped_set, swapped_score = greedy_addition(LL, target, kept, 1)
        if swapped_score>top_score+ϵ
            top_score = swapped_score
            top_set = swapped_set
        end
    end

    top_set,top_score
end


# Search for a bag of words whose summed embedding approximates `target`.
#
# Alternates greedy_addition and word_swap_refinement on data["LL"] for at most
# `rounds` rounds. Stops early when an addition step scores >= 0.0, when a swap
# step lands within ϵ of the previous round's score, or when the swap score is
# >= -ϵ. With `log=true` the word set is printed after every step.
#
# Returns (words, score): the selected indices looked up in
# data["indexed_words"], and the last score stored in `best_score`
# (-Inf if no round ran).
@everywhere function greedy_search{F<:AbstractFloat}(data::Dict, target::Vector{F}; rounds=1000, log=false)
    # Translate word indices back to words for logging and for the return value.
    get_words(word_iis) = [data["indexed_words"][ii] for ii in word_iis]
    
    word_iis = Int[]
    best_word_iis = word_iis
    best_score=-Inf
    for round in 1:rounds
        word_iis, add_score = greedy_addition(data["LL"], target, word_iis)
        log && println("POST_ADD_STEP: $add_score $(get_words(word_iis))")
        # Sanity check: adding words must not make the score worse.
        # NOTE(review): if greedy_addition hands back the same array object it was
        # given, the second disjunct is trivially true and this check is vacuous.
        @assert add_score + ϵ >= best_score || best_word_iis == word_iis "$add_score vs $best_score $(get_words(word_iis))"
        best_word_iis = word_iis
        
        # Scores are negated distances, so >= 0.0 means the target was hit exactly.
        if add_score>= 0.0 
            best_score = add_score
            break 
        end        
        

        
        word_iis, swap_score = word_swap_refinement(data["LL"], target, word_iis)
        log && println("POST_SWAP_STEP: $swap_score $(get_words(word_iis))")
        # Sanity check: refinement must not make the score worse.
        @assert swap_score + ϵ >= add_score || best_word_iis == word_iis
        best_word_iis = word_iis
        
        # Converged when the score stopped moving since the previous round, or
        # when it is already within ϵ of a perfect match.
        converged = best_score - ϵ<swap_score<best_score + ϵ || swap_score>=-ϵ
        best_score=swap_score
        if converged
            break 
        end       
    end
    get_words(word_iis),best_score
end



# Sentence entry point: embed `target_sent` with lookup_sowe and run the
# vector-target greedy_search on the result, forwarding all keyword arguments.
@everywhere function greedy_search{S}(data::Dict, target_sent::Vector{S}; kwargs...)
    greedy_search(data, lookup_sowe(data,target_sent); kwargs...)
end

In [8]:
using Blocks
using Lumberjack
# Register a Lumberjack truck named "file-logger" that writes to selection.log.
add_truck(LumberjackTruck("selection.log"), "file-logger")

  likely near /home/ubuntu/.julia/v0.5/Blocks/src/Blocks.jl:3
  likely near /home/ubuntu/.julia/v0.5/Blocks/src/Blocks.jl:3
  likely near /home/ubuntu/.julia/v0.5/Blocks/src/Blocks.jl:3
  likely near /home/ubuntu/.julia/v0.5/Blocks/src/Blocks.jl:3
  likely near /home/ubuntu/.julia/v0.5/Blocks/src/Blocks.jl:3
Base.FS is deprecated.
  likely near /home/ubuntu/.julia/v0.5/Blocks/src/Blocks.jl:3
  likely near /home/ubuntu/.julia/v0.5/Blocks/src/Blocks.jl:3
  likely near /home/ubuntu/.julia/v0.5/Blocks/src/Blocks.jl:3
  likely near /home/ubuntu/.julia/v0.5/Blocks/src/Blocks.jl:3
  likely near /home/ubuntu/.julia/v0.5/Blocks/src/Blocks.jl:3
  likely near /home/ubuntu/.julia/v0.5/Blocks/src/Blocks.jl:3
  likely near /home/ubuntu/.julia/v0.5/Blocks/src/Blocks.jl:3


Lumberjack.LumberjackTruck(IOStream(<file selection.log>),nothing,Dict{Any,Any}(:is_colorized=>false,:uppercase=>false))

In [9]:
# Wrap the test set in a Blocks.Block so run() can process it chunk by chunk.
# NOTE(review): the arguments 1 and 1000 presumably mean "along dimension 1, in
# chunks of up to 1000 sentences" — confirm against the Blocks.jl documentation.
test_set_blocks = Block(test_set, 1, 1000)

Blocks.Block{Array{Array{Symbol,1},1}}([[:okay,symbol(","),:i,:said,symbol(","),:this,:is,:going,:to,:be,:some,:fun,:revenge,:.],[:this,:is,:just,:great,symbol(","),:seth,:shouted,:.],[:start,:in,:the,:beginning,:and,:do,symbol("n't"),:hold,:anything,:back,:.],[:a,:lot,:of,:explanations,:.],[:even,:if,:she,:really,:loved,:him,symbol(","),:noah,:would,:never  …  :with,:her,:past,:and,:his,:?,:his,:first,:kiss,:?],[:a,:dilemma,:that,:never,:would,:have,:bothered,:her,:before,:.],[:noah,:saw,:her,:disappearing,:down,:a,:hallway,:at,:the,:back  …  :her,:room,symbol(","),:which,:slammed,:shut,:in,:his,:face,:.],[symbol("``"),:thank,:god,:you,:did,:?,symbol("''")],[:feeling,:the,:awkwardness,symbol(","),:wara,:did,symbol("n't"),:respond,:.],[:i,symbol("'m"),:not,:going,:to,:leave,:you,:alone,:.,symbol("''")]  …  [:damn,:.],[:i,:picked,:up,:the,:sheet,:of,:paper,:i,:had,:jotted  …  :and,:gave,:it,:a,:quick,:rip,:down,:the,:middle,:.],[:his,:words,:caused,:my,:heart,:to,:pick,:up,:speed,:becau

In [10]:
# Open a JLD file for appending and pass the handle to `func`.
# An existing file is opened with "r+" so its contents are kept; "w" (which
# would truncate) is only used when the file does not exist yet.
function jldopen_append(func::Function, filename::AbstractString)
    if isfile(filename)
        jldopen(func, filename, "r+")
    else
        jldopen(func, filename, "w")
    end
end

jldopen_append (generic function with 1 method)

In [11]:
# One result record: (target sentence, selected words, score).
const res_type = Tuple{Array{Symbol,1},Array{Symbol,1},Float32}
# Run greedy_search over every sentence of every block in `test_set_blocks`.
#
# Each block is searched in parallel with pmap, using the `data` global that
# exists on each process. After a block finishes, its average score is logged
# and its results are written to `save_path` under the key string(ii), where ii
# is the running block number. Progress and errors go through Lumberjack.
#
# NOTE(review): whether an error propagates to the caller depends on whether
# Lumberjack.error rethrows after logging — confirm against Lumberjack's docs.
function run(test_set_blocks, save_path="selection.jld")
    try
        Lumberjack.info("Began selection")
        ii = 0 
        map(test_set_blocks) do block
            
            # err_stop=true: stop the pmap on the first failing sentence.
            block_res::Vector{res_type} = pmap(block,err_stop=true) do target_sent
            #block_res::Vector{res_type} = map(block) do target_sent
                sol, score = greedy_search(data, target_sent, rounds=10_000, log=false)
                (target_sent, sol, score)
            end
            ii+=1
            avg_score = sum([r[3] for r in block_res])/length(block)
            Lumberjack.info("$ii done: $avg_score")
            # Append this block's results so finished blocks survive a later crash.
            jldopen_append(save_path) do fh
                write(fh,string(ii), block_res)
            end
            Lumberjack.debug("$ii written to disc")
        end
    catch err
        Lumberjack.error("Unhandled Error", base_exception=err)
    end
    Lumberjack.info("complete selection")
end

run (generic function with 2 methods)

In [None]:
# Run the full selection. err_oo stays 0 unless an exception propagates out of
# run(), in which case the exception is kept for inspection.
err_oo =0
try
    run(test_set_blocks, "results/bags/$(corpus_filename).jld") 
catch err
    err_oo = err
end

2016-01-29T18:41:57 - info: Began selection


In [None]:
# Show the value of the most recently evaluated cell.
ans

In [146]:
# Hand-picked example sentences, each split on whitespace into a vector of
# tokens. The commented-out suffixes and the trailing shuffle are disabled variants.
eg_corpus = @pipe ["name this 1922 novel about leopold bloom written by james joyce",#* " .",
    "ralph waldo emerson dismissed this poet as the jingle man and james russell lowell called him three-fifths genius and two-fifths sheer fudge",# * " .",
    "this is the basis of a comedy of manners first performed in 1892",#*" .",
    "in a third novel a sailor abandons the patna and meets marlow who in another novel meets kurtz in the congo",
    "thus she leaves her husband and child for aleksei vronsky but all ends sadly when she leaps in front of a train",
    "we looked out at the setting sun .",
    " i went to the kitchen .",
    "how are you doing ?"
    ] |>map(split,_) #|> map(shuffle,_)

8-element Array{Array{SubString{ASCIIString},1},1}:
 SubString{ASCIIString}["name","this","1922","novel","about","leopold","bloom","written","by","james","joyce"]                                                                                         
 SubString{ASCIIString}["ralph","waldo","emerson","dismissed","this","poet","as","the","jingle","man"  …  "russell","lowell","called","him","three-fifths","genius","and","two-fifths","sheer","fudge"]
 SubString{ASCIIString}["this","is","the","basis","of","a","comedy","of","manners","first","performed","in","1892"]                                                                                    
 SubString{ASCIIString}["in","a","third","novel","a","sailor","abandons","the","patna","and"  …  "marlow","who","in","another","novel","meets","kurtz","in","the","congo"]                             
 SubString{ASCIIString}["thus","she","leaves","her","husband","and","child","for","aleksei","vronsky"  …  "ends","sadly","when","she","leaps","in","

In [19]:
# Search each example sentence in parallel, with per-step logging. Tokens are
# converted to Symbols before the lookup. Each result is a (words, score) tuple.
eg_res = pmap(eg_corpus) do sent
    greedy_search(data, map(symbol,sent), rounds=10_000, log=true)
end

	From worker 9:	POST_ADD_STEP: -4.98164554219167 Any[:you,symbol("'re"),:do,:how,:besides]
	From worker 8:	POST_ADD_STEP: -1.2333925124245116e-6 Any[:i,:to,:the,:kitchen,:went,:.]
	From worker 7:	POST_ADD_STEP: -2.4465027206035164e-6 Any[:we,:at,:out,:sun,:the,:looked,:setting,:.]
	From worker 9:	POST_SWAP_STEP: -4.804646 Any[symbol("'re"),:do,:how,:besides,:?]
	From worker 9:	POST_ADD_STEP: -4.804646 Any[symbol("'re"),:do,:how,:besides,:?]
	From worker 8:	POST_SWAP_STEP: -1.0199695e-6 Any[:i,:to,:the,:kitchen,:went,:.]
	From worker 2:	POST_ADD_STEP: -2.3108506554848527e-6 Any[:written,:novel,:by,:james,:name,:leopold,:about,:bloom,symbol("1922"),:this,:joyce]
	From worker 9:	POST_SWAP_STEP: -4.4905496 Any[symbol("'re"),:do,:how,:?,:well]
	From worker 9:	POST_ADD_STEP: -4.490549 Any[symbol("'re"),:do,:how,:?,:well]
	From worker 7:	POST_SWAP_STEP: -2.8359793e-6 Any[:we,:at,:out,:sun,:the,:looked,:setting,:.]
	From worker 4:	POST_ADD_STEP: -4.21602260213538 Any[:this,:a,:of,:in,:comedy,:

8-element Array{Any,1}:
 (Any[:written,:novel,:by,:james,:name,:leopold,:about,:bloom,symbol("1922"),:this,:joyce],-2.4892306f-6)                                                                                                             
 (Any[:him,symbol("``"),:james,:was,:ralph,:genius,:known,:emerson,:lowell,:the  …  :__________________________________,:sheer,:and,:hapless,:aldrick,:_____________,:both,:this,:hirsute,symbol("2008")],-8.132799f0)
 (Any[:a,:of,:in,:comedy,:of,:first,:this,:performed,:basis,:manners,symbol("1892"),:the,:is],-3.704945f-6)                                                                                                           
 (Any[:a,:in,:the,:a,:novel,:meets,:in,:who,:in,:meets  …  :congo,:another,:marlow,:the,:abandons,:patna,:sailor,:kurtz,:and,:third],-5.4765537f-6)                                                                   
 (Any[:she,:her,:she,:all,:when,:child,:for,:leaves,:front,:and  …  :ends,:aleksei,:leaps,:of,:vronsky,:in,:sadly,:a

In [163]:
?replace

search: 

```
replace(string, pat, r[, n])
```

Search for the given pattern `pat`, and replace each occurrence with `r`. If `n` is provided, replace at most `n` occurrences. As with search, the second argument may be a single character, a vector or a set of characters, a string, or a regular expression. If `r` is a function, each occurrence is replaced with `r(s)` where `s` is the matched substring. If `pat` is a regular expression and `r` is a `SubstitutionString`, then capture group references in `r` are replaced with the corresponding matched text.


replace redisplay score_possible_additions



In [220]:
# Build the body of a LaTeX table comparing two bags of words.
# One row per distinct word (sorted by its string form) with the word's count
# in `actual` (column O) and in `reference` (column R), preceded by a header row.
function prep_table(actual, reference)
    actual = map(symbol, actual)
    reference = map(symbol, reference)

    vocabulary = union(Set(actual), reference)
    words = map(symbol, sort(map(string, vocabulary)))

    actual_bow = counter(actual)
    reference_bow = counter(reference)

    rows = ["$word & $(actual_bow[word]) & $(reference_bow[word])" 
                for word in words]
    "Word & O & R \\\\\n\\hline\n"*join(rows,"\\\\\n")
end

# Replace runs of 5 to 50 underscores with a compact LaTeX placeholder that
# records the run length. Longer run lengths are tried first, so a run is
# matched at its full length before any shorter pattern can split it.
function escape(str)
    n = 50
    while n >= 5
        underscores = repeat("_", n)
        str = replace(str,underscores,"\\emph{\\_...\\_($n)}")
        n -= 1
    end
    str
end

# Wrap the comparison table for (`actual`, `reference`) in a LaTeX \subfloat
# containing a three-column tabular. `cap` is the subfloat caption.
# Returns the LaTeX source as a single string.
function prep_doc_fragment(actual, reference, cap="")
    # The table body is escaped so long underscore runs survive LaTeX.
    table = prep_table(actual,reference) |> escape
    # Assemble in one go rather than growing a string with repeated `*=`.
    return string("\\subfloat[][$cap]{\n",
                  "\\begin{tabular}{ l c r }\n",
                  table,
                  "\n\\end{tabular}\n",
                  "}\n")
end

prep_doc_fragment (generic function with 2 methods)

In [221]:
# One LaTeX fragment per example: ag[1] is a (words, score) result, so ag[1][1]
# is the recovered word list; ag[2] is the reference sentence.
examples = map(ag->prep_doc_fragment(ag[1][1],ag[2]), zip(eg_res, eg_corpus))
#examples = [table for (ii,table) in enumerate(examples)]
# Print the fragments followed by the closing lines of the LaTeX document.
print(join(examples, "\n")*"\n\\end{figure}\n\\end{document}")

\subfloat[][]{
\begin{tabular}{ l c r }
Word & O & R \\
\hline
1922 & 1 & 1\\
about & 1 & 1\\
bloom & 1 & 1\\
by & 1 & 1\\
james & 1 & 1\\
joyce & 1 & 1\\
leopold & 1 & 1\\
name & 1 & 1\\
novel & 1 & 1\\
this & 1 & 1\\
written & 1 & 1
\end{tabular}
}

\subfloat[][]{
\begin{tabular}{ l c r }
Word & O & R \\
\hline
2008 & 1 & 0\\
\emph{\_...\_(13)} & 1 & 0\\
\emph{\_...\_(34)} & 1 & 0\\
\emph{\_...\_(44)} & 1 & 0\\
`` & 1 & 0\\
aldrick & 1 & 0\\
and & 2 & 2\\
as & 0 & 1\\
both & 1 & 0\\
called & 0 & 1\\
dismissed & 1 & 1\\
emerson & 1 & 1\\
fudge & 1 & 1\\
genius & 1 & 1\\
hapless & 1 & 0\\
him & 1 & 1\\
hirsute & 1 & 0\\
james & 1 & 1\\
jingle & 1 & 1\\
known & 1 & 0\\
lowell & 1 & 1\\
man & 0 & 1\\
poet & 1 & 1\\
ralph & 1 & 1\\
russell & 1 & 1\\
sheer & 1 & 1\\
the & 1 & 1\\
this & 1 & 1\\
three-fifths & 1 & 1\\
two-fifths & 1 & 1\\
waldo & 1 & 1\\
was & 1 & 0
\end{tabular}
}

\subfloat[][]{
\begin{tabular}{ l c r }
Word & O & R \\
\hline
1892 & 1 & 1\\
a & 1 & 1\\
basis & 1 & 1\\
com

In [222]:
# Re-join the tokenized example sentences for display.
[join(eg," ") for eg in eg_corpus]

8-element Array{ByteString,1}:
 "name this 1922 novel about leopold bloom written by james joyce"                                                                             
 "ralph waldo emerson dismissed this poet as the jingle man and james russell lowell called him three-fifths genius and two-fifths sheer fudge"
 "this is the basis of a comedy of manners first performed in 1892"                                                                            
 "in a third novel a sailor abandons the patna and meets marlow who in another novel meets kurtz in the congo"                                 
 "thus she leaves her husband and child for aleksei vronsky but all ends sadly when she leaps in front of a train"                             
 "we looked out at the setting sun ."                                                                                                          
 "i went to the kitchen ."                                                                               

In [233]:
# Euclidean distance between the summed embeddings of {do, 're} and {doing, are}.
sqrt(sumabs2(lookup_sowe(data,[:do, symbol("'re")]) - lookup_sowe(data,[:doing, :are])))

6.105681f0

In [242]:
# Euclidean length of the embedding of the single token "2008".
sqrt(sumabs2(lookup_sowe(data,[symbol("2008")])))

5.6503687f0

In [236]:
# Summed embedding of a sentence with repeated words.
lookup_sowe(data,[:it, :was, :the, :best, :of, :times, symbol(","),:it, :was, :the, :worst, :of, :times,])

300-element Array{Float32,1}:
  -0.793585 
   1.26606  
   0.282562 
  -2.06237  
   0.358368 
  -0.187004 
  -1.55621  
   1.83744  
  -0.15024  
 -21.5628   
   2.68219  
   1.36028  
  -0.66929  
   ⋮        
  -0.170959 
   0.829028 
  -0.0764151
  -3.52772  
   0.051009 
   1.75085  
   3.31713  
   3.76385  
  -1.16536  
  -1.90704  
  -1.97162  
  -0.293317 

In [15]:
# Load every entry of the Brown/GloVe 50d JLD file into a Dict (master process only).
data_brown = load("results/data/brown_glove50d.jld")

Dict{ByteString,Any} with 6 entries:
  "zeroed_words"  => ASCIIString[]
  "LL"            => 50x40482 Array{Float64,2}:…
  "unknown_words" => UTF8String["\$2.82","54,320","eventshah-leh","wycoff's","m…
  "indexed_words" => AbstractString["the",",",".","of","to","and","in","a","for…
  "corpus"        => [ASCIIString["``","only","a","relative","handful","of","su…
  "word_indexes"  => Dict{AbstractString,Int64}("baleful"=>34074,"confined"=>77…

In [17]:
# Number of sentences in the Brown corpus entry.
data_brown["corpus"] |> length

42004

In [None]:
# Search the first 20 test sentences in parallel, stopping on the first error.
# NOTE(review): tokens are converted to strings here, while `data` was loaded
# from the "data_sym" entry — confirm its word_indexes accept string keys.
res = pmap(test_set[1:20], err_stop=true) do target_sent
#res = map(test_set) do target_sent
    target_sent=map(string, target_sent)
    sol, score = greedy_search(data, target_sent, rounds=10_000, log=false)
    (target_sent, sol, score)
end

In [None]:
using JLD
# Persist `res` to a JLD file under its variable name.
@save "results/bags/books300d.jld" res

In [None]:
# Inspect the kind of type the first result has (the type of its type).
res[1] |> typeof |> typeof

In [None]:
# Walk through one test sentence with step-by-step logging. target_sowe is kept
# as a global because later experimental cells reuse it.
target_sent = test_set[67]
target_sowe = lookup_sowe(data,target_sent)
println(target_sent)
println("-------------")
best_words, best_score  = greedy_search(data, target_sent, log=true)

In [None]:
#########
# Experimental
#####

In [None]:
# List methods that accept a pset argument.
methodswith(pset)

In [None]:
# Scratch check of persistent-set behaviour: push returns a new set, then union
# with two extra elements and count the result.
a=push(pset(),3)
union(a, 5,4) |> length


In [None]:
# Exhaustively try deleting up to `max_remove` words from `best_word_set` and
# keep the deletion that brings the summed embedding closest to `target`.
#
# Words are tracked by position, so a word that occurs several times can be
# removed once, several times, or not at all. A deletion is only accepted when
# it beats the best score so far by more than ϵ.
#
# Returns (word_set, score): a new Vector{Int} (equal to the input when no
# deletion helped) and its fitness. The input vector is not modified.
function word_delete_refinement(LL::Matrix{Float64},
                              target::Vector{Float64},
                              best_word_set::Vector{Int},
                              max_remove = 2
        )
    
    initial_word_set = copy(best_word_set)
    initial_end_point = get_end(LL, initial_word_set)
    best_score = fitness(target, initial_end_point)
    best_removed = Int[]   # positions (not word ids) deleted in the best set so far

    # `removed`: positions deleted so far; `endpoint`: the sum without them.
    # Only positions >= first_pos are considered, so every combination of
    # positions is visited exactly once rather than once per ordering.
    function inner(removed, endpoint, first_pos)
        if length(removed)>=max_remove
            return
        end
        for pos in first_pos:length(initial_word_set)
            new_endpoint = endpoint - LL[:,initial_word_set[pos]]
            new_removed = [removed; pos]
            del_score = fitness(target, new_endpoint)

            if del_score>best_score + ϵ
                best_removed = new_removed
                best_score = del_score
            end

            # Recurse from the reduced end point so deletions accumulate.
            inner(new_removed, new_endpoint, pos+1)
        end
    end

    inner(Int[], initial_end_point, 1)

    keep = trues(length(initial_word_set))
    keep[best_removed] = false
    initial_word_set[keep],best_score
end

In [None]:
# Scratch experiment: score a hand-written word set against target_sowe, then
# try to improve it by deleting up to 2 words.
before_words = ["not","after","said","election","to","the",",","calls","williams","due","bill", "anonymous",",","subjected","opens","consisted","snodgrass",".","folks","meanwhile","truculent","was","soon", "he"]
before_word_iis = lookup_indexes(data, before_words)
fitness(target_sowe, get_end(data["LL"], before_word_iis))
word_delete_refinement(data["LL"], target_sowe, before_word_iis, 2)



In [None]:
# Fitness of the hand-written word set before any refinement.
fitness(target_sowe, get_end(data["LL"], before_word_iis))

In [None]:
# Shorter/longer refinement: replace n words with m other words.
# Either the words in the current set are too short, in which case the search
# can be limited to words longer than the sum of the shortest ones,
# or they are too long, in which case the reverse applies.

# This variant removes some words and aims to get closer to the target by
# adding a finite number (m_add) of other words.
# Experimental: returns the column indices of `LL` that pass the length filter
# below. It does not perform the replacement itself.
function make_longer_refinement2(LL::Matrix{Float64},
                              target::Vector{Float64},
                              best_word_set::Vector{Int},
                              remove_indexes::Vector{Int},
                              m_add::Int)
    @assert(m_add>=0)
    
    # Boolean mask over positions of best_word_set; false marks a removed position.
    rem_indexes = trues(best_word_set)
    rem_indexes[remove_indexes] = false
    
    cur_word_set = best_word_set[rem_indexes]
    cur_end_point = get_end(LL, cur_word_set)

    # Distance from each current word's own vector to the target (the end point
    # passed in is the zero vector). NOTE(review): this value is never used.
    cur_lengths = -score_possible_additions(LL[:,best_word_set], target, get_end(LL, Int[])) |> vec

    # NOTE(review): fitness is a negated distance, so this is the SUM of the
    # distance before removal and the distance after removal, not their
    # difference — confirm that is intended.
    length_removed = -fitness(target, get_end(LL,best_word_set)) -fitness(target, cur_end_point)
        
    @printval length_removed
    # Distance from each column scaled by m_add to the target.
    LL_scores = -score_possible_additions(m_add*LL, target, get_end(LL, Int[]))
    
    valid_replacers = find((m_add-1)*maximum(LL_scores) + LL_scores.>=length_removed)   
    valid_replacers
end



In [None]:
# NOTE(review): `best_wordset` is not assigned anywhere in the visible notebook
# (a later cell assigns `best_word_set`) — confirm which variable is meant.
make_longer_refinement2(data["LL"], target_sowe, best_wordset, [1:39;], 2)

In [None]:
# Look up the word stored at index 40482.
data["indexed_words"][40482]

In [None]:
# Scratch arithmetic.
342/3

In [None]:
# Distance from three times each word vector to target_sowe, then the indices
# of the words for which that distance is below 120.
LLscores = -score_possible_additions(3*data["LL"], target_sowe, get_end(data["LL"], Int[]))
find(LLscores.<120)

In [None]:
# Greedy addition that also remembers near-miss candidates for later backtracking.
#
# Works like a plain greedy addition: keep appending the single best-scoring
# word while it improves the score by more than ϵ, for at most `max_additions`
# additions. In addition, at every step the `keep_per_word` highest-scoring
# candidate indices are collected into `good_ideas`.
#
# Returns (word_set, score, good_ideas). The input `best_word_set` is not modified.
@everywhere function greedy_addition_with_backtrack_prep(LL::Matrix{Float64},
                         target::Vector{Float64},
                         best_word_set::Vector{Int},
                         keep_per_word = 50,
                         max_additions = Inf, 
                         
    )
    
    best_word_set = copy(best_word_set)   # grow a private copy, not the caller's vector
    end_point = get_end(LL, best_word_set)
    best_score = fitness(target, end_point)
    good_ideas = Set{Int}()
    sizehint!(good_ideas, 10*keep_per_word) #Most sentences in most corpora have at least 10 words
    # Never ask for more candidates than there are words.
    n_keep = min(keep_per_word, size(LL,2))
    cur_additions = 0
    while(cur_additions<max_additions)
        
        cur_additions+=1
        
        addition_scores = score_possible_additions(LL, target, end_point)
        
        # Scores are negated distances, so the best candidates are the LARGEST
        # ones: rev=true makes select! return the top n_keep, best first.
        high_scores = select!(collect(enumerate(addition_scores)), 1:n_keep,
                              by=ii_score->ii_score[2], rev=true)
        isempty(high_scores) && break
        
        union!(good_ideas, map(ii_score->ii_score[1],high_scores))
        
        addition,addition_score = high_scores[1]
        if addition_score>best_score+ϵ
            best_score=addition_score
            push!(best_word_set, addition)
            end_point += LL[:,addition]
        else
            # The best candidate no longer improves the score.
            break
        end
    end
    best_word_set,best_score,good_ideas
end

In [None]:
# Run the backtrack-prep greedy addition from an empty word set with default limits.
best_word_set,best_score,good_ideas  = greedy_addition_with_backtrack_prep(data["LL"], target_sowe, Int[], )

In [None]:
# Scratch: display the `max` function object.
max

In [None]:
# Number of distinct candidate indices collected.
#@pipe good_ideas |> map(println,_)
length(good_ideas)

In [None]:
# NOTE(review): `best_wordset` is not assigned in the visible notebook
# (`best_word_set` is) — confirm which variable is meant.
word_delete_refinement(data["LL"], target_sowe, best_wordset)

In [None]:
# NOTE(review): same `best_wordset` / `best_word_set` naming question as above.
length(best_wordset)

In [None]:
# Scratch: display `a` (the persistent set built in an earlier cell).
a

In [None]:
# Swap refinement that replaces two words by one precomputed word pair.
#
# For every pair of positions (ii, jj) in `best_word_set`, both words are
# removed and the best-scoring column of `LL2` (a matrix of summed word pairs)
# is added instead. `LL2_index[k,:]` holds the two word indices behind column k
# of `LL2`. A replacement is accepted only if it beats the best score so far by
# more than ϵ.
#
# Returns (word_set, score).
function double_word_swap_refinement(LL::Matrix{Float64},
                              LL2::Matrix{Float64}, LL2_index,
                              target::Vector{Float64},
                              best_word_set::Vector{Int})
    
    initial_word_set = copy(best_word_set)
    initial_end_point = get_end(LL, initial_word_set)
    # fitness takes (target, end_point); the earlier three-argument call
    # fitness(LL, target, best_word_set) matched no method.
    best_score = fitness(target, initial_end_point)
    
    for ii in 1:length(initial_word_set)
        for jj in ii+1:length(initial_word_set)
            # End point with words ii and jj taken out.
            @inbounds end_point = initial_end_point - LL[:,initial_word_set[ii]] - LL[:,initial_word_set[jj]]
            addition_score, addition_index = findmax(score_possible_additions(LL2, target, end_point))
            if addition_score>best_score + ϵ
                @inbounds word_subset = initial_word_set[[1:ii-1; ii+1:jj-1; jj+1:end]]
                # Append both words of the winning pair.
                best_word_set = vcat(word_subset, vec(LL2_index[addition_index,:]))
                best_score = addition_score
            end
        end
    end

    best_word_set,best_score
        
end

# Precompute the sum of every unordered pair of columns of `LL` (a column paired
# with itself included).
# Returns (LL2, LL2_index): column k of LL2 is LL[:,ii] + LL[:,jj], where
# (ii, jj) = LL2_index[k,:] and jj <= ii. The number of pairs is printed.
function get_LL2(LL)
    n_words = size(LL,2)
    len_LL2 = (n_words.^2  + n_words)÷ 2
    LL2_index = Matrix{Int}(len_LL2, 2)
    @printval(len_LL2)
    LL2 = Matrix{Float64}(size(LL,1), len_LL2)
    col = 0
    for ii in 1:n_words, jj in 1:ii
        col += 1
        LL2_index[col,1] = ii
        LL2_index[col,2] = jj
        LL2[:,col] = LL[:,ii]+LL[:,jj]
    end
    LL2, LL2_index
end


In [None]:
# get_LL2 returns a (matrix, index) pair; unpack it so that LL2 is the matrix
# and LL2_index exists, as the double_word_swap_refinement call further down
# expects. LL is bound as well because that call refers to it by name.
LL = data["LL"]
LL2, LL2_index = get_LL2(LL)

In [None]:
# Display the embedding matrix.
data["LL"]

In [None]:
# Back-of-envelope size estimate: 819416403 = (40482^2 + 40482) ÷ 2 is the
# number of word pairs get_LL2 would build for a 40482-word vocabulary, times
# 50 dimensions. NOTE(review): the factor 2 and the intended unit are unclear.
819416403*50*2/ 1000_000_000

In [None]:
# Time one double-word swap refinement.
# NOTE(review): `best_wordset` is not assigned in the visible notebook, and
# LL / LL2_index must have been bound by an earlier cell for this to run.
target_sowe = lookup_sowe(data,target_sent)
@time new_best_words, new_best_score =double_word_swap_refinement(LL, LL2, LL2_index, target_sowe, best_wordset)


In [None]:
# Switch the working example to test sentence 12 and recompute its embedding sum.
target_sent = test_set[12]
println(target_sent)
target_sowe = lookup_sowe(data,target_sent)

In [None]:
# Add standard-normal noise to the target embedding ten times and print the
# words recovered from each noisy vector.
# NOTE(review): the greedy_search methods visible in this notebook take only
# keyword options (rounds, log); the positional 1,100 look like arguments of an
# older signature — confirm before re-running.
for ii in 1:10
    sample_sowe = (randn!(similar(target_sowe)) + target_sowe)
    print(greedy_search(data, sample_sowe, 1,100, log=false)[1])
    println(",")
end

In [None]:
# Search the whole test set in parallel, keeping each sentence's index with its result.
# NOTE(review): positional 1, 100 do not match the greedy_search methods visible
# in this notebook — likely an older signature.
res = pmap([1:length(test_set);], test_set) do ii,target_sent
    sol, score = greedy_search(data, target_sent, 1, 100, log=false)
    (sol, score, ii)
end

In [None]:
# Serialize the recovered word lists together with the test set.
open("atis2_res_glove.jsz","w") do fh
    serialize(fh, (Vector{ASCIIString}[sol for (sol, score, ii) in res], test_set)
    )    
end



In [None]:
# Scale a sentence embedding by 0.6 and see what the search recovers.
# NOTE(review): `atis_data` is not defined in the visible notebook, and the
# positional 1, 5 do not match the visible greedy_search methods.
se = lookup_sowe(atis_data, ["flights","serve","lunch","which"])
se=se*0.6
greedy_search(atis_data, se, 1, 5, log=false)

In [None]:
# Indices of results whose score (second tuple field) is negative, i.e. not an
# exact reconstruction, followed by those results.
hard_cases = find(x->x[2]<0, res)
res[hard_cases]

In [None]:
# The sentences behind the hard cases.
test_set[hard_cases]

In [None]:
# Re-run the search on the hard cases only.
# NOTE(review): `atis_data` is not defined in the visible notebook, and the
# positional 2,5 do not match the visible greedy_search methods.
hard_set =  test_set[hard_cases]

hard_res = pmap([1:length(hard_set);], hard_set) do ii,target_sent
    sol, score = greedy_search(atis_data, target_sent, 2,5, log=false)
    (sol, score, ii)
end

In [None]:
# Hard cases that still have a negative score after the re-run.
very_hard_cases = find(x->x[2]<0.0, hard_res)

In [None]:
# Scratch: number of orderings of 14 items.
factorial(14)

In [None]:
# Show the results that remain unsolved.
hard_res[very_hard_cases]

In [None]:


# Print the index and text of every test sentence whose search score is positive.
# NOTE(review): the fitness visible in this notebook is a negated norm and so is
# never positive — confirm whether `score<0` (unsolved sentences) was intended.
for (ii,target_sent) in enumerate(test_set)
    sol, score = greedy_search(atis_data, target_sent, log=false)
    if score>0
        print("$ii - ")
        println(join(target_sent, " "))
    end
end
    

In [None]:
# Materialize (index, sentence) pairs for inspection.
enumerate(test_set) |> collect