In [1]:
using Pipe

In [2]:
# Debug helper: print the concrete type of an array and its size on one line.
pz(x :: AbstractArray) = println("$(typeof(x)): $(size(x))")

pz (generic function with 1 method)

In [3]:
using PyCall
@pyimport nltk



In [4]:
# Split a sentence into word tokens using NLTK's word_tokenize (called through
# PyCall) and return them as a Julia vector of strings.
function tokenize(sentence::String)
    py_tokens = nltk.word_tokenize(sentence)
    convert(Vector{String}, py_tokens)
end

tokenize (generic function with 1 method)

In [5]:
include("load_embeddings.jl")

load_embeddings (generic function with 1 method)

In [6]:
# Load the embedding matrix together with the word -> index-vector dictionary and
# the index -> word list, then print the matrix size and the container types.
LL, word_index, indexed_words = load_embeddings("embeddings-scaled.EMBEDDING_SIZE=50.txt");
println(size(LL))
println(typeof(word_index))
println(typeof(indexed_words))

(50,268810)
Dict{String,BitArray{1}}
Array{String,1}


In [31]:
# A single embedding: a Float64 vector.
typealias Embedding Vector{Float64}
# Several embeddings held in one Float64 matrix; the functions in this file
# index it one embedding per column (e.g. cs[:,iis]).
typealias Embeddings Matrix{Float64}
# A one-dimensional array of words, with either ASCIIString or String elements.
typealias Words Union(AbstractArray{ASCIIString,1},AbstractArray{String,1})
# State of the model (presumably a recursive autoencoder, going by the name):
# the word-embedding lookup data plus the encoder and decoder weight matrices.
type RAE
    # Embedding matrix; multiplied by the vectors stored in word_index to obtain
    # word embeddings.
    L::Matrix{Float64}
    # Maps each word to a BitVector that is multiplied with L
    # (presumably a one-hot selector for the word's column -- TODO confirm).
    word_index::Dict{String,BitVector}
    # List of known words (presumably ordered to match the columns of L -- TODO confirm).
    indexed_words::Vector{String}
    
    # Encoder weights, applied to the stacked input [c_i; c_j; 1].
    W_e::Matrix{Float64}
    # Decoder weights, applied to the stacked input [p; 1].
    W_d::Matrix{Float64}
   
end


# Convenience constructor: wrap the loaded embedding data and initialise the
# encoder and decoder weights with small Gaussian noise (standard deviation 0.01).
#
# With n = size(L,1) the encoder W_e is n x (2n+1) and the decoder W_d is
# 2n x (n+1); the extra column in each holds the bias weights.
function RAE(L::Matrix{Float64}, word_index::Dict{String,BitVector}, indexed_words::Vector{String})
    n = size(L, 1)

    W_e = 0.01*randn(n, 2n+1)
    W_d = 0.01*randn(2n, n+1)
    RAE(L, word_index, indexed_words, W_e, W_d)
end

# Look up the embedding of a single word.
#
# rae   : model holding the embedding matrix L and the word_index dictionary.
# input : the word to look up; indexing the dictionary raises a KeyError when the
#         word is not present.
#
# Returns rae.L multiplied by the BitVector stored for the word
# (presumably a one-hot vector, so this selects one column of L -- TODO confirm).
function eval_embedding(rae::RAE, input::String)
    # Use the model's own dictionary and the function argument, not globals.
    k = rae.word_index[input]
    rae.L*k
end

# Look up the embeddings of a sequence of words.
#
# rae    : model holding the embedding matrix L and the word_index dictionary.
# inputs : the words to look up, in order; indexing the dictionary raises a
#          KeyError for a word that is not present.
#
# Returns a matrix with one column per input word, in input order.
function eval_embeddings(rae::RAE, inputs::Words)
    # Read the index vectors from the model itself rather than from a global
    # `word_index`, so the result always matches the RAE that was passed in.
    # The per-word BitVectors are placed side by side, one column per word.
    ks = hcat(map(word -> rae.word_index[word], inputs)...)
    rae.L*ks
end

# Encode pairs of child embeddings into parent embeddings.
#
# c_is, c_js : matrices of identical size; column k of c_is and column k of c_js
#              form the k-th pair. An AssertionError is raised if the sizes differ.
#
# Returns tanh(W_e * [c_i; c_j; 1]) evaluated for all pairs at once, one parent
# embedding per column.
function eval_embeddings(rae::RAE, c_is::Embeddings, c_js::Embeddings)
    @assert size(c_is)==size(c_js)
    # One bias entry per pair. Query the column count of c_is directly: wrapping it
    # as `[c_is]` only gave the same answer through implicit bracket concatenation
    # and made a needless copy of the matrix.
    bias_in = ones(1,size(c_is, 2))
    tanh(rae.W_e*[c_is;c_js;bias_in])
end



eval_embeddings (generic function with 2 methods)

In [66]:
# Score candidate pairs by how well they can be reconstructed.
#
# Each pair (column k of c_is, column k of c_js) is encoded to a parent with
# eval_embeddings, decoded again with tanh(W_d * [parent; 1]), and compared to the
# stacked original [c_i; c_j].
#
# Returns a 1 x (number of pairs) matrix of summed squared reconstruction errors.
function eval_scores(rae::RAE, c_is::Embeddings, c_js::Embeddings)
    parents = eval_embeddings(rae, c_is, c_js)
    n_pairs = size(parents, 2)

    # Decode every parent back into a stacked (c_i; c_j) estimate.
    decoder_input = [parents; ones(1, n_pairs)]
    reconstructed = tanh(rae.W_d*decoder_input)

    targets = [c_is; c_js]
    residual = targets - reconstructed
    sum(residual.^2, 1)
end


eval_scores (generic function with 1 method)

In [82]:
# Build the initial sentence_len x sentence_len Bool adjacency matrix of a sentence:
# entry (ii, jj) is true exactly when positions ii and jj are direct neighbours.
function word_adjancy_matrix(sentence_len::Int)
    #TODO: Replace this whole adjacency matrix with a linked list
    positions = 1:sentence_len
    [abs(ii - jj) == 1 for ii in positions, jj in positions]
end

# Greedily merge adjacent nodes of a sentence into a binary tree.
#
# rr       : the RAE providing word embeddings and encoder/decoder weights.
# sentence : the tokens of the sentence, in order.
#
# Each iteration scores every currently adjacent pair of nodes, merges the selected
# pair into a new parent node, and rewires the adjacency matrix so the parent takes
# over the neighbours of its two children. The loop ends when no adjacent pairs
# remain.
#
# Returns a tuple (tree, embedding, score_total):
#   tree        - the last node created, as nested 2-tuples of words (or the last
#                 word itself if no merge happened, e.g. a one-word sentence);
#   embedding   - the embedding of that last node;
#   score_total - the sum of the scores of all merges performed.
function eval_to_tree(rr::RAE, sentence::AbstractArray{String})
    # backlookup[k] describes node k (a word, or a tuple of two earlier nodes);
    # column k of cs is the embedding of the same node.
    backlookup = sentence
    cs = eval_embeddings(rr, backlookup)
    AA = word_adjancy_matrix(size(cs,2))
    
    score_total = 0.0
    while(any(AA))
        # Every true entry of AA is a candidate (ii, jj) pair; since AA is kept
        # symmetric, each neighbouring pair is evaluated in both orders.
        iis, jjs = findn(AA)

        # Parent embeddings and scores for all candidates at once.
        # (eval_scores recomputes the same parents internally.)
        pps = eval_embeddings(rr, cs[:,iis],cs[:,jjs])
        scores = eval_scores(rr, cs[:,iis],cs[:,jjs])
        # NOTE(review): scores are squared reconstruction errors (see eval_scores),
        # so indmax selects the pair with the LARGEST error -- confirm whether
        # indmin was intended.
        best_pair_ind = indmax(scores)
        
         
        ii_best, jj_best = sort([iis[best_pair_ind], jjs[best_pair_ind]])
        #The above makes a more readable output, but doesn't do anything useful,
        #it is same as below:
        #ii_best = iis[best_pair_ind]
        #jj_best = jjs[best_pair_ind]
        # NOTE(review): pp_best is the parent for the unsorted (ii, jj) order found
        # above, while the tree below records the sorted order.
        pp_best = pps[:,best_pair_ind]
        score_total+=scores[best_pair_ind]
        
        # Append the new parent node: its embedding becomes a new last column and
        # its (left, right) structure a new last entry. The bracket form relies on
        # `[a, b]` concatenating in the Julia version this was written for.
        cs = [cs pp_best]
        backlookup = [backlookup, (backlookup[ii_best],backlookup[jj_best])]
        
        #Adjust Adjacency Matrix

        # Drop the edge between the two merged children first, so the parent does
        # not inherit an edge to its own children (or to itself).
        AA[ii_best,jj_best] = false
        AA[jj_best,ii_best] = false
        # The parent is adjacent to everything either child was adjacent to. The
        # column is built from the already row-extended matrix, so it has the
        # right height.
        AA = [AA; AA[ii_best,:] | AA[jj_best,:]] # Add row
        AA = [AA AA[:,ii_best] | AA[:,jj_best]] # Add col
        # Isolate the children so they can never be selected again.
        AA[ii_best,:] = AA[jj_best,:] = false
        AA[:,ii_best] = AA[:,jj_best] = false #Remove anything that was adjacent to old

    end
    # The most recently created node is the root of the tree.
    tree = backlookup[end]
    embedding = cs[:,end]
    tree, embedding, score_total
end


eval_to_tree (generic function with 1 method)

In [83]:
# Build an RAE from the loaded embeddings (the weights are randomly initialised by
# the constructor), tokenize an example sentence, and parse it into a tree.
rr = RAE(LL,word_index,indexed_words);
sent = "the boy destroyed the house"
sent_toks = tokenize(sent)

# Result is (tree, embedding of the root, total score).
eval_to_tree(rr,sent_toks)


((("the","boy"),("destroyed",("the","house"))),[0.000627286,0.00706757,0.00281409,0.0166301,-0.0055186,-0.00542435,-0.0273057,0.00879303,0.014203,-0.0140284  …  -0.013639,0.00516059,0.00343523,0.00877316,-0.00334319,-0.00906643,-0.0153371,-0.0159076,-0.00704809,0.010371],24.45741859809114)

In [81]:
# Embed every token of the example sentence, then score each pair of
# neighbouring words: columns 1..end-1 are the left words, 2..end the right words.
cs = eval_embeddings(rr, tokenize(sent))
c_is = cs[:, 1:(end-1)]
c_js = cs[:, 2:end]

eval_scores(rr, c_is, c_js)

1x4 Array{Float64,2}:
 9.59159  5.36591  9.57028  12.0189

In [None]:
# Decode a parent embedding back into the stacked reconstruction of its children.
#
# pps : the parent embedding to unfold.
#
# Returns tanh(W_d * [pps; 1]) as a single-column matrix (pps is stacked on top of
# a 1x1 bias block).
function unfold(rae::RAE, pps::Embedding)
    # The body previously referred to an undefined `pp`; use the `pps` argument.
    bias_in = ones(1,size(pps, 2))
    tanh(rae.W_d*[pps;bias_in])
end

In [None]:
# Look up the embedding of a single word with the RAE built above.
eval_embedding(rr, "house")