In [1]:
ENV["LINES"] = 30
ENV["COLUMNS"] = 300

300

In [2]:
using Pipe
function pz(x :: AbstractArray)
    println(typeof(x), ": ", size(x))
end

pz (generic function with 1 method)

In [3]:
using PyCall
@pyimport nltk
function tokenize(sentence::String)
    convert(Array{String,1},nltk.word_tokenize(sentence))
end



tokenize (generic function with 1 method)

In [4]:
include("load_embeddings.jl")

load_embeddings (generic function with 1 method)

In [5]:
LL,word_indexes, indexed_words =  load_embeddings("embeddings-scaled.EMBEDDING_SIZE=50.txt");
size(LL) |> println
word_indexes |> typeof |> println
indexed_words |> typeof |> println

(50,268810)
Dict{String,Int64}
Array{String,1}


In [48]:
typealias Embedding Vector{Float64}
typealias Embeddings Matrix{Float64}
typealias Words Union(AbstractArray{ASCIIString,1},AbstractArray{String,1})
type RAE
    L::Matrix{Float64}
    word_index::Dict{String,Int}
    indexed_words::Vector{String}
    
    W_e::Matrix{Float64}
    b_e::Vector{Float64}
    W_d::Matrix{Float64}
    b_d::Vector{Float64}
   
end


function RAE(L::Matrix{Float64},word_index::Dict{String,Int}, indexed_words::Vector{String})
    emb_width = size(L,1)
    
    W_e =0.01*randn(emb_width,emb_width*2) 
    b_e = 0.01*randn(emb_width) 
    #W_d = 0.01*randn(emb_width*2,emb_width)
    W_d = pinv(W_e) #Cheat (Actually why can't I always do this to initialize?);
    b_d = 0.01*randn(emb_width*2)
    
    RAE(L,word_index, indexed_words, W_e, b_e, W_d, b_d)
end


function get_word_index(rae::RAE, input::String, show_warn=true)
    if haskey(rae.word_index, input)
        ii = rae.word_index[input]
    elseif haskey(rae.word_index, lowercase(input))
        ii = rae.word_index[lowercase(input)]
    else
        ii = rae.word_index["*UNKNOWN*"]
        if show_warn
            println("$input not found. Defaulting.")
        end
    end
    ii
end


function eval_word_embedding(rae::RAE, input::String, show_warn=true)
    k=get_word_index(rae, input, show_warn)
    rae.L[:,k]
end

function eval_word_embeddings(rae::RAE, inputs::Words, show_warn=false)
    ks = @pipe inputs |> map(ii -> get_word_index(rae,ii, show_warn), _)
    rae.L[:,ks]
end




function eval_merges(rae::RAE, c_is::Embeddings, c_js::Embeddings)
    @assert size(c_is)==size(c_js)

    tanh(rae.W_e*[c_is;c_js].+rae.b_e)
end

function reconstruct(rae::RAE, pp::Embedding)
    ĉ_ijs = tanh(rae.W_d*pp+rae.b_d)
    ĉ_is = ĉ_ijs[1:end/2]
    ĉ_js = ĉ_ijs[end/2+1:end]
    ĉ_is, ĉ_js
end



function unfold_merges(rae::RAE, pps::Embeddings)
    ĉ_ijs = tanh(rae.W_d*pps .+ rae.b_d)
end

unfold_merges (generic function with 1 method)

In [56]:
function eval_scores(rae::RAE, c_is::Embeddings, c_js::Embeddings,
                     pps=eval_merges(rae, c_is, c_js))
    c_ijs = [c_is;c_js]
    ĉ_ijs = unfold_merges(rae,pps)
    
    1/2*sum((c_ijs-ĉ_ijs).^2,1)
end
#A better scoring function is given in SorcherRAE (eaquation 6)


function eval_scores_gradient(rae::RAE, 
                              ĉ_ijs::Embeddings,
                              pps::Embeddings,
                              c_ijs::Embeddings)
    #http://neuralnetworksanddeeplearning.com/chap2.html
    
    da = (c_ijs - ĉ_ijs)
    dz_d = (1-ĉ_ijs.^2)
    δ_d = da.*dz_d
    
    #loop below if deep    
    dz_e = (1-pps.^2)
    δ_e = (rae.W_d'*δ_d).*dz_e 
    
    
    ∇W_d = δ_d*pps'
    ∇b_d = δ_d
    ∇W_e = δ_e*c_ijs'
    ∇b_e = δ_e
    
    
    ∇W_e, ∇b_e, ∇W_d, ∇b_d

end



eval_scores_gradient (generic function with 1 method)

In [57]:
rr = RAE(LL,word_indexes,indexed_words);
c_1 = eval_word_embedding(rr,"killer")''
c_2 = eval_word_embedding(rr, "cows")''
c_12 = [c_1; c_2]
p_12=eval_merges(rr, c_1'', c_2'')
ĉ_12 = unfold_merges(rr,p_12)
#eval_scores_gradient(rr, ĉ_12, p_12, c_12)


(
50x100 Array{Float64,2}:
 -0.358534     -0.828338    -1.6629      -0.266672     -0.285667     2.52346      0.952676     2.293        2.17838      1.23577      1.33082     -0.832697    …   0.141022      0.678446    -1.15244      0.246818     -1.18257      0.0400143    -0.169281     -0.0633738     0.0439598    -1.25544   
 -0.228286     -0.52742     -1.0588      -0.169795     -0.18189      1.60674      0.606588     1.46         1.38702      0.78684      0.847358    -0.530195        0.0897913     0.43198     -0.73378      0.157154     -0.752967     0.0254779    -0.107785     -0.0403514     0.0279901    -0.799362  
  0.00643422    0.0148653    0.0298423    0.00478567    0.00512655  -0.0452858   -0.0170966   -0.0411501   -0.039093    -0.022177    -0.0238827    0.0149435      -0.00253076   -0.0121753    0.0206815   -0.00442937    0.0212223   -0.000718094   0.0030379     0.0011373    -0.0007889     0.02253   
  0.0151909     0.0350962    0.0704563    0.0112987     0.0121035   -0.106918    -

In [50]:
function eval_to_tree(rr::RAE,sentence::String)
    eval_to_tree(rr, tokenize(sentence))
end

function eval_to_tree(rr::RAE, sentence::Words)
    tree = tuple(sentence...)
    cs = eval_word_embeddings(rr, sentence)
    score_total = 0.0
    while(size(cs,2)>1)
        c_is = cs[:, 1:end-1]
        c_js = cs[:, 2:end]
        
        pps = eval_merges(rr, c_is, c_js)
        scores = eval_scores(rr, c_is, c_js, pps)
        im = indmax(scores)
        
        score_total+=scores[im]
         
        cs = [cs[:,1:im-1] pps[:,im] cs[:,im+2:end]]
        tree = tuple(tree[1:im-1]..., (tree[im], tree[im+1]), tree[im+2:end]...)
    end
    tree = tree[1] #The final step creates a tuple containing one element, as first and last parts are empty
    embedding = cs[:]
    tree, embedding, score_total
end


eval_to_tree (generic function with 2 methods)

In [51]:
rr = RAE(LL,word_indexes,indexed_words);

sent = "the boy destroyed the house"
sent_toks = tokenize(sent)

tree, pp, score_total = eval_to_tree(rr,sent_toks)

(((("the","boy"),"destroyed"),("the","house")),[-0.00703771,0.0261892,0.0165252,0.0143384,-0.0128244,0.0052343,0.00878562,0.00292834,-0.0146886,-0.0065181  …  -0.00387706,-0.000136323,0.0131976,0.00165049,0.0189463,-0.00262567,0.0107307,0.0139779,-0.0053516,0.0120967],8.204136328437572)

In [19]:
using Iterators
@pyimport nltk.corpus as nltk_corpus
n_training = 5
training_sents = @pipe nltk_corpus.brown[:sents]() |> take(_,n_training)  |> collect |> convert(Vector{Vector{String}},_);


In [None]:
using Optim #https://github.com/JuliaOpt/Optim.jl

rae_outer = RAE(LL,word_indexes,indexed_words);

function unpack!(rae::RAE, θ::Vector)
    W_e_len = length(rae.W_e)
    b_e_len = length(rae.b_e)
    W_d_len = length(rae.W_d)
    b_d_len = length(rae.b_d)
    W_e_shape = size(rae.W_e)
    W_d_shape = size(rae.W_d)
    
    rae.W_e = reshape(θ[1: W_e_len],W_e_shape)
    rae.b_e = θ[W_e_len+1: b_e_len]
    rae.W_d = reshape(θ[W_e_len+b_e_len+1: W_e_len+b_e_len+W_d_len],W_d_shape)
    rae.b_d = θ[W_e_len+b_e_len+W_d_len+1: end]
    
    rae
end

function pack(rae::RAE)
    [rae.W_e[:],rae.b_e[:], rae.W_d[:],rae.b_d[:],] 
end

#--------------------------------------------------------

function loss(θ::Vector)  
    rae = unpack!(rae_outer, θ)
    err =@pipe training_sents |> map( ss-> eval_to_tree(rae, ss)[3], _) |> mean
    err
end

function loss_grad(θ::Vector, storage::Vector)   
    rae = unpack!(rae_outer, θ)
    err =@pipe training_sents |> map( ss-> eval_to_tree(rae, ss)[3], _) |> mean
    err
end

function loss_grad_and_loss_grad(θ::Vector, storage::Vector)   
    rae = unpack!(rae_outer, θ)
    err =@pipe training_sents |> map( ss-> eval_to_tree(rae, ss)[3], _) |> mean
    println("**", err)
    err
end

f=DifferentiableFunction(loss,loss_grad!,loss_and_loss_grad!)
#Must provide Graident as finite difference requires ~length(θ) calls to f
res = optimize(f, pack(rae_outer), method=:l_bfgs, show_trace = true, store_trace=true, iterations = 10)
println(res)
rae_outer = unpack!(rae_outer, res.minimum)

https://github.com/JuliaLang/julia/blob/master/doc/manual/profile.rst Actual instructions


In [None]:
Profile.clear()
@profile f(pack(rae_outer))


In [None]:
using ProfileView
ProfileView.view()

In [None]:
#tree data in tree is not use, other than it's structure.
#((("the","house"),("destroyed",("the","boy")))  is equivalent to ((("",""),("",("",""))) 



function unfold(rae::RAE, tree::(String,String), pp::Embedding)
    ĉ_is, ĉ_js = reconstruct(rae, pp)
    [ĉ_is ĉ_js]
end


function unfold(rae::RAE, tree::(Any,String), pp::Embedding)
    p̂_is, ĉ_js = reconstruct(rae, pp)
    ĉ_is = unfold(rae, tree[1], p̂_is)
    [ĉ_is ĉ_js]
end

function unfold(rae::RAE, tree::(String,Any), pp::Embedding)
    ĉ_is, p̂_js = reconstruct(rae, pp)
    ĉ_js = unfold(rae, tree[2], p̂_js)
    [ĉ_is ĉ_js]
    
end

function unfold(rae::RAE, tree::(Any,Any), pp::Embedding)
    p̂_is, p̂_js = reconstruct(rae, pp)
    ĉ_is = unfold(rae, tree[1], p̂_is)
    ĉ_js = unfold(rae, tree[2], p̂_js)
    [ĉ_is ĉ_js]
end

    

In [None]:
tree, pp, score_total = eval_to_tree(rr,"killer cows")
ĉs = unfold(rr,tree,pp)

show_bests(rr, ĉs)


In [None]:
function cosine_dist(a,b)
    (a⋅b)/(norm(a)*norm(b))
end

function neighbour_dists(cc::Vector{Float64}, globe::Matrix{Float64})
    [cosine_dist(cc, globe[:,ii]) for ii in 1:size(globe,2)]
end


function show_best(rae::RAE,ĉ::Embedding, nbest=20)
    candidates=neighbour_dists(ĉ,rae.L)   
    best_cands = [ (findfirst(candidates,score), score)
                    for score in select(candidates,1:nbest, rev=true)[1:nbest]]
    vcat([[rae.indexed_words[ii] score] for (ii,score) in best_cands]...)
end

function show_bests(rae::RAE,ĉs::Embeddings, nbest=20)
    hcat([show_best(rae,ĉs[:,ii],nbest) for ii in 1:size(ĉs,2)]...)
end


In [None]:
function depth_inc(ele::(Int,String))
    (ele[1]+1,ele[2])
end

function unfold_struct(tree::(Any,Any))
    left_tree = unfold_struct(tree[1]) 
    left = @pipe left_tree |> map(depth_inc,_)
    right_tree = unfold_struct(tree[2]) 
    right = @pipe right_tree |> map(depth_inc,_)
    [left, right, (0,"")]
end

function unfold_struct(tree::(Any,String))
    left_tree = unfold_struct(tree[1]) 
    left = @pipe left_tree |> map(depth_inc,_)
    [left, (0,tree[2]), (0,"")]
end
function unfold_struct(tree::(String,Any))
    right_tree = unfold_struct(tree[2]) 
    right = @pipe right_tree |> map(depth_inc,_)
    [(0,tree[1]),right, (0,"")]
end
function unfold_struct(tree::(String,String))
    [(0,tree[1]), (0, tree[2]), (0,"")]
end

function print_tree(tree::(Any,Any))
    
    for (depth,word ) in unfold_struct(tree)
        println("\t"^depth, word)
    end
end