In [38]:
# Set the LINES and COLUMNS environment variables for this session.
# NOTE(review): presumably these control how many rows/columns of output get displayed — confirm.
ENV["LINES"] = 30
ENV["COLUMNS"] = 300

300

In [3]:

using Pipe

In [4]:
# Debug helper: print the type of `x`, then ": ", then its dimensions tuple.
function pz(x::AbstractArray)
    kind = typeof(x)
    dims = size(x)
    println(kind, ": ", dims)
end

pz (generic function with 1 method)

In [5]:
using PyCall
@pyimport nltk



In [6]:
# Split `sentence` into tokens with NLTK's `word_tokenize` (reached through the
# `nltk` module imported via PyCall) and return them as an `Array{String,1}`.
function tokenize(sentence::String)
    tokens = nltk.word_tokenize(sentence)
    convert(Vector{String}, tokens)
end

tokenize (generic function with 1 method)

In [7]:
include("load_embeddings.jl")

load_embeddings (generic function with 1 method)

In [8]:
# Load the embedding table, the word -> index-vector lookup and the vocabulary list.
# The trailing `;` suppresses the notebook's echo of the returned tuple.
LL, word_index, indexed_words = load_embeddings("embeddings-scaled.EMBEDDING_SIZE=50.txt");
# Report the matrix dimensions and the container types that were loaded.
println(size(LL))
println(typeof(word_index))
println(typeof(indexed_words))

(50,268810)
Dict{String,BitArray{1}}
Array{String,1}


In [9]:
# A single embedding: one Float64 column vector.
typealias Embedding Vector{Float64}
# Several embeddings stored as the columns of a matrix (callers slice it as `cs[:, i]`).
typealias Embeddings Matrix{Float64}
# A tokenised sentence: a 1-d array of ASCIIString or of String.
typealias Words Union(AbstractArray{ASCIIString,1},AbstractArray{String,1})
# Model state (presumably a recursive autoencoder, going by the name):
#   L              embedding matrix; size(L,1) is used as the embedding width
#   word_index     word -> BitVector; the BitVector is multiplied by L to get the word's embedding
#   indexed_words  vocabulary, looked up by integer index when listing nearest words
#   W_e            encoder weights, applied to [c_i; c_j; bias]
#   W_d            decoder weights, applied to [parent; bias]
type RAE
    L::Matrix{Float64}
    word_index::Dict{String,BitVector}
    indexed_words::Vector{String}
    
    W_e::Matrix{Float64}
    W_d::Matrix{Float64}
   
end


# Build an RAE around an embedding table, with freshly initialised weights.
#   L              embedding matrix; its row count is the embedding width
#   word_index     word -> BitVector lookup, stored on the model unchanged
#   indexed_words  vocabulary, stored on the model unchanged
# W_e is 0.01*randn with shape (width, 2*width+1); the last column multiplies the bias input.
# W_d is derived from the pseudo-inverse of W_e: its last row (the one matching W_e's
# bias column) is dropped and a zero bias column is appended, giving (2*width, width+1).
function RAE(L::Matrix{Float64}, word_index::Dict{String,BitVector}, indexed_words::Vector{String})
    emb_width = size(L, 1)

    W_e = 0.01*randn(emb_width, emb_width*2+1)
    # The pseudo-inverse is the costly step, so compute it once and reuse it.
    W_e_pinv = pinv(W_e)
    #Cheat (Actually why can't I always do this to initialize?)
    # The plain random alternative would be 0.01*randn(emb_width*2, emb_width+1).
    W_d = [W_e_pinv[1:end-1,:] zeros(size(W_e_pinv,1)-1)]
    RAE(L, word_index, indexed_words, W_e, W_d)
end

# Return the embedding of a single word.
#   rae    model holding the embedding matrix `L` and the `word_index` lookup
#   input  the word; it must be a key of `rae.word_index` (a missing key raises the Dict's error)
# The word's BitVector is multiplied by `rae.L`, yielding one embedding column.
function eval_embedding(rae::RAE, input::String)
    # Use the model's own table and the actual argument; the previous body referenced
    # an undefined `ii` and a global `word_index`.
    k = rae.word_index[input]
    rae.L*k
end

# Return the embeddings of a sequence of words, one column per word, in input order.
#   rae     model holding the embedding matrix `L` and the `word_index` lookup
#   inputs  the words; each must be a key of `rae.word_index`
function eval_embeddings(rae::RAE, inputs::Words)
    # Look words up in the model's own table rather than in a global of the same name,
    # so the result always matches the `rae` that was passed in.
    ks = @pipe inputs |> map(ii -> rae.word_index[ii], _) |> hcat(_...)
    rae.L*ks
end

# Encode pairs of child embeddings into parent embeddings.
#   c_is, c_js  left and right children, one pair per column; both must have the same size
# Returns tanh(W_e * [c_i; c_j; 1]) for every column, as a matrix with one parent per column.
function eval_embeddings(rae::RAE, c_is::Embeddings, c_js::Embeddings)
    @assert size(c_is)==size(c_js)
    # One bias entry per pair. The column count is read from the matrix itself;
    # wrapping it in `[...]` first only worked through implicit concatenation.
    bias_in = ones(1, size(c_is, 2))
    tanh(rae.W_e*[c_is;c_js;bias_in])
end



eval_embeddings (generic function with 2 methods)

In [10]:
# Squared reconstruction error of every candidate pair.
# Each pair (c_i, c_j) is encoded to a parent, decoded again with W_d (plus a bias row),
# and compared with the stacked original [c_i; c_j].
# Returns a 1-row matrix holding the summed squared difference per column.
function eval_scores(rae::RAE, c_is::Embeddings, c_js::Embeddings)
    parents = eval_embeddings(rae, c_is, c_js)
    bias_row = ones(1, size(parents, 2))
    reconstructed = tanh(rae.W_d*[parents; bias_row])

    originals = [c_is; c_js]
    residual = originals - reconstructed
    sum(residual.^2, 1)
end


eval_scores (generic function with 1 method)

In [148]:
# Convenience method: tokenise a raw sentence, then delegate to the token-array method.
eval_to_tree(rr::RAE, sentence::String) = eval_to_tree(rr, tokenize(sentence))

# Greedily fold a token sequence into a binary tree of nested 2-tuples.
# Each round encodes every pair of neighbouring columns, picks the pair selected by
# `indmax` over `eval_scores`, and replaces the two columns by their parent.
# Returns (tree, root embedding as a vector, sum of the selected scores).
# A single-token input skips the loop and returns that token as the "tree".
# NOTE(review): eval_scores returns squared reconstruction errors, so `indmax` merges the
# pair with the LARGEST error — confirm whether the smallest error was intended.
function eval_to_tree(rr::RAE, sentence::Words)
    tree = tuple(sentence...)
    cs = eval_embeddings(rr, sentence)
    score_total = 0.0
    while size(cs, 2) > 1
        lefts = cs[:, 1:end-1]
        rights = cs[:, 2:end]

        parents = eval_embeddings(rr, lefts, rights)
        scores = eval_scores(rr, lefts, rights)
        best = indmax(scores)

        score_total += scores[best]

        # Columns best and best+1 collapse into their parent; the tree mirrors that.
        cs = [cs[:, 1:best-1] parents[:, best] cs[:, best+2:end]]
        merged = (tree[best], tree[best+1])
        tree = tuple(tree[1:best-1]..., merged, tree[best+2:end]...)
    end
    # After the last merge the outer tuple holds exactly one element: the root.
    root = tree[1]
    root, cs[:], score_total
end


eval_to_tree (generic function with 2 methods)

In [164]:
# Build a model from the loaded embeddings (the 3-argument constructor initialises W_e randomly).
rr = RAE(LL,word_index,indexed_words);

sent = "the boy destroyed the house"
sent_toks = tokenize(sent)

# Fold the tokens into a tree; returns the tree, the root embedding and the summed score.
tree, pp, score_total = eval_to_tree(rr,sent_toks)


((("the","boy"),("destroyed",("the","house"))),[0.0214169,-0.00536514,0.00615191,-0.00153357,0.0158808,-0.00435284,-0.00463049,0.00217613,-0.00856859,0.0171683  …  0.000850085,0.00711771,0.00133812,0.0036469,0.00375145,0.0124213,-0.00547713,0.0223813,0.00619695,-0.00629083],12.581912730517956)

In [151]:
#The word data in the tree is not used, only its structure.
#(("the","house"),("destroyed",("the","boy")))  is equivalent to (("",""),("",("","")))

# Decode one parent embedding into its two reconstructed children.
#   pp  the parent embedding
# Computes tanh(W_d * [pp; 1]) and splits the result in half.
# Returns (first half, second half), i.e. the reconstructed left and right child.
function reconstruct(rae::RAE, pp::Embedding)
    bias_in = 1
    ĉ_ijs = tanh(rae.W_d*[pp;bias_in])
    # Integer division keeps the split point an Int; `end/2` yields a Float64 index.
    half = div(length(ĉ_ijs), 2)
    ĉ_is = ĉ_ijs[1:half]
    ĉ_js = ĉ_ijs[half+1:end]
    ĉ_is, ĉ_js
end

# Unfold a node whose children are both leaves: decode `pp` once and return the two
# reconstructed leaf embeddings side by side (left column, right column).
function unfold(rae::RAE, tree::(String,String), pp::Embedding)
    left_leaf, right_leaf = reconstruct(rae, pp)
    hcat(left_leaf, right_leaf)
end


# Unfold a node with a subtree on the left and a leaf on the right: the left half of
# the decoding is unfolded recursively, the right half is kept as the leaf column.
function unfold(rae::RAE, tree::(Any,String), pp::Embedding)
    left_parent, right_leaf = reconstruct(rae, pp)
    left_leaves = unfold(rae, tree[1], left_parent)
    hcat(left_leaves, right_leaf)
end

# Unfold a node with a leaf on the left and a subtree on the right: the left half of
# the decoding is kept as the leaf column, the right half is unfolded recursively.
function unfold(rae::RAE, tree::(String,Any), pp::Embedding)
    left_leaf, right_parent = reconstruct(rae, pp)
    right_leaves = unfold(rae, tree[2], right_parent)
    hcat(left_leaf, right_leaves)
end

# Unfold a node whose children are both subtrees: decode `pp`, recurse into each half,
# and return all reconstructed leaf columns in left-to-right order.
function unfold(rae::RAE, tree::(Any,Any), pp::Embedding)
    left_parent, right_parent = reconstruct(rae, pp)
    left_leaves = unfold(rae, tree[1], left_parent)
    right_leaves = unfold(rae, tree[2], right_parent)
    hcat(left_leaves, right_leaves)
end

    

unfold (generic function with 4 methods)

In [167]:
# Encode a two-word phrase, then decode the root embedding back into one column per leaf.
tree, pp, score_total = eval_to_tree(rr,"killer cows")
ĉs = unfold(rr,tree,pp)

# For each reconstructed column, list the vocabulary words with the highest cosine values.
show_bests(rr, ĉs)


20x4 Array{Any,2}:
 "killer"       0.853416  "cows"         0.693514
 "elder"        0.685059  "wax"          0.669501
 "MRTA"         0.656065  "gloves"       0.665464
 "PRD"          0.652857  "smog"         0.644174
 "young"        0.636397  "trees"        0.641096
 "female"       0.635095  "ink"          0.630874
 "killers"      0.629722  "helmets"      0.627258
 "pilot"        0.615603  "vitamins"     0.621529
 "veteran"      0.614612  "insects"      0.613549
 "prestigious"  0.611901  "allergies"    0.607636
 "UVF"          0.609147  "motorcycles"  0.607245
 "animated"     0.608794  "plutonium"    0.603155
 "academic"     0.606901  "clothes"      0.602247
 "Sandinista"   0.606661  "rigidity"     0.601668
 "sect"         0.603418  "addicts"      0.600928
 "flamboyant"   0.602693  "beard"        0.598731
 "PRI"          0.601258  "wind"         0.597801
 "child"        0.6005    "condoms"      0.597552
 "CIA"          0.600131  "panties"      0.592773
 "charismatic"  0.599295  "spec

In [16]:
# Cosine of the angle between `a` and `b`: their dot product divided by the product of
# their norms. Despite the name, larger values mean the vectors point in more similar
# directions (1 for the same direction, -1 for opposite).
function cosine_dist(a, b)
    numerator = dot(a, b)
    denominator = norm(a)*norm(b)
    numerator/denominator
end

# Compare `cc` against every column of `globe` with `cosine_dist`.
# Returns one value per column, in column order.
function neighbour_dists(cc::Vector{Float64}, globe::Matrix{Float64})
    ncols = size(globe, 2)
    [cosine_dist(cc, globe[:, col]) for col in 1:ncols]
end


# List the vocabulary words whose embeddings have the highest cosine value against `ĉ`.
#   rae    model providing the embedding matrix `L` and the `indexed_words` vocabulary
#   ĉ      the query embedding
#   nbest  how many rows to return (clamped to the vocabulary size)
# Returns a matrix with one row per hit: the word and its score, best first.
function show_best(rae::RAE,ĉ::Embedding, nbest=20)
    candidates = neighbour_dists(ĉ, rae.L)
    nshown = min(nbest, length(candidates))
    # Rank by index rather than looking each score up again: tied scores then map to
    # distinct words, and the vocabulary is not rescanned once per hit.
    best_inds = sortperm(candidates, rev=true)[1:nshown]
    vcat([[rae.indexed_words[ii] candidates[ii]] for ii in best_inds]...)
end

# Run `show_best` on every column of `ĉs` and place the resulting word/score tables
# side by side, one pair of columns per embedding.
function show_bests(rae::RAE,ĉs::Embeddings, nbest=20)
    tables = [show_best(rae, ĉs[:, col], nbest) for col in 1:size(ĉs, 2)]
    hcat(tables...)
end


show_bests (generic function with 2 methods)

In [161]:
# Return a copy of a (depth, word) row with the depth increased by one.
function depth_inc(ele::(Int,String))
    depth, word = ele
    (depth + 1, word)
end

# Flatten a node whose children are both subtrees into (depth, word) rows.
# Rows coming from either child are pushed one level deeper; a blank (0,"") row
# closes the node.
function unfold_struct(tree::(Any,Any))
    left = map(depth_inc, unfold_struct(tree[1]))
    right = map(depth_inc, unfold_struct(tree[2]))
    vcat(left, right, (0,""))
end

# Flatten a node with a subtree on the left and a word on the right.
# The subtree's rows are pushed one level deeper, the word is emitted at depth 0,
# and a blank (0,"") row closes the node.
function unfold_struct(tree::(Any,String))
    left = map(depth_inc, unfold_struct(tree[1]))
    vcat(left, (0,tree[2]), (0,""))
end
# Flatten a node with a word on the left and a subtree on the right.
# The word is emitted at depth 0, the subtree's rows are pushed one level deeper,
# and a blank (0,"") row closes the node.
function unfold_struct(tree::(String,Any))
    right = map(depth_inc, unfold_struct(tree[2]))
    vcat((0,tree[1]), right, (0,""))
end
# Flatten a node whose children are both words: each word at depth 0, followed by the
# blank (0,"") row that closes the node.
function unfold_struct(tree::(String,String))
    first_word, second_word = tree
    vcat((0,first_word), (0,second_word), (0,""))
end

# Print a tree one row per line, indenting each row with one tab per depth level.
# The rows come from `unfold_struct`, so every node is followed by a blank entry.
function print_tree(tree::(Any,Any))
    rows = unfold_struct(tree)
    for row in rows
        depth, word = row
        println("\t"^depth, word)
    end
end

print_tree (generic function with 1 method)

In [149]:
# Print the tree currently held in the global `tree` as a tab-indented outline.
print_tree(tree)

	the
	house
	
	destroyed
		the
		boy
		
	



In [46]:
using DataStructures

In [48]:
# Experiment with the DataStructures `list`. Passing an array creates a list whose single
# element is that array (the error further down reports the type Cons{Array{Int64,1}}).
aa=list([1,2,3,4])

list([1,2,3,4])

In [49]:
# Integer indexing is not defined for this list type; this raises the `getindex` error shown below.
aa[2]

LoadError: `getindex` has no method matching getindex(::Cons{Array{Int64,1}}, ::Int64)
while loading In[49], in expression starting on line 1

In [None]:
# Build a sentence_len x sentence_len Bool matrix that is true exactly where the row and
# column positions are direct neighbours (they differ by one). The matrix is symmetric.
function word_adjancy_matrix(sentence_len::Int)
    #TODO: Replace this whole adjancy matrix with a linked list
    positions = 1:sentence_len
    [abs(row - col) == 1 for row in positions, col in positions]
end


# Convenience method: tokenise a raw sentence, then delegate to the token-array method.
eval_to_tree(rr::RAE, sentence::String) = eval_to_tree(rr, tokenize(sentence))

# Adjacency-matrix variant of the greedy tree builder. It has the same signature as the
# earlier eval_to_tree(rr::RAE, sentence::Words) and replaces it once this cell is evaluated.
# Instead of shrinking `cs`, every merge APPENDS the parent as a new column and the Bool
# matrix `AA` tracks which columns may still be merged with each other.
# Returns (tree, root embedding, sum of the selected scores).
# NOTE(review): `AA` starts out symmetric, so both (i,j) and (j,i) are scored and a pair can
# be merged right-to-left — confirm that is intended.
# NOTE(review): `indmax` picks the pair with the largest squared reconstruction error — confirm.
function eval_to_tree(rr::RAE, sentence::Words)
    # backlookup[k] is the word or subtree that column k of `cs` stands for.
    backlookup = sentence
    cs = eval_embeddings(rr, backlookup)
    AA = word_adjancy_matrix(size(cs,2))
    
    score_total = 0.0
    # Keep merging while any two columns are still marked adjacent.
    while(any(AA))
        # Row and column indices of every true entry: the candidate (left, right) pairs.
        iis, jjs = findn(AA)

        pps = eval_embeddings(rr, cs[:,iis],cs[:,jjs])
        scores = eval_scores(rr, cs[:,iis],cs[:,jjs])
        best_pair_ind = indmax(scores)
        

        ii_best = iis[best_pair_ind]
        jj_best = jjs[best_pair_ind]
        pp_best = pps[:,best_pair_ind]
        score_total+=scores[best_pair_ind]
        
        # The parent becomes a new last column, and its subtree a new last entry.
        cs = [cs pp_best]
        backlookup = [backlookup, (backlookup[ii_best],backlookup[jj_best])]
        
        #Adjust Adjacency Matrix

        # The merged pair is no longer adjacent to itself ...
        AA[ii_best,jj_best] = false
        AA[jj_best,ii_best] = false
        # ... and the new node inherits the remaining neighbours of both children.
        AA = [AA; AA[ii_best,:] | AA[jj_best,:]] # Add row
        AA = [AA AA[:,ii_best] | AA[:,jj_best]] # Add col
        AA[ii_best,:] = AA[jj_best,:] = false
        AA[:,ii_best] = AA[:,jj_best] = false #Remove anything that was adjacent to old

    end
    # The last appended entry/column is the root (or the only word if no merge happened).
    tree = backlookup[end]
    embedding = cs[:,end]
    tree, embedding, score_total
end
