In [1]:
ENV["LINES"] = 30
ENV["COLUMNS"] = 300

300

In [2]:
using Pipe
function pz(x :: AbstractArray)
    println(typeof(x), ": ", size(x))
end

pz (generic function with 1 method)

In [3]:
using PyCall
@pyimport nltk
function tokenize(sentence::String)
    convert(Array{String,1},nltk.word_tokenize(sentence))
end



tokenize (generic function with 1 method)

In [4]:
include("load_embeddings.jl")

load_embeddings (generic function with 1 method)

In [5]:
LL,word_indexes, indexed_words =  load_embeddings("embeddings-scaled.EMBEDDING_SIZE=50.txt");
size(LL) |> println
word_indexes |> typeof |> println
indexed_words |> typeof |> println

(50,268810)
Dict{String,Int64}
Array{String,1}


In [6]:
typealias Embedding Vector{Float64}
typealias Embeddings Matrix{Float64}
typealias Words Union(AbstractArray{ASCIIString,1},AbstractArray{String,1})
type RAE
    L::Matrix{Float64}
    word_index::Dict{String,Int}
    indexed_words::Vector{String}
    
    W_e::Matrix{Float64}
    b_e::Vector{Float64}
    W_d::Matrix{Float64}
    b_d::Vector{Float64}
   
end


function RAE(L::Matrix{Float64},word_index::Dict{String,Int}, indexed_words::Vector{String})
    emb_width = size(L,1)
    
    W_e =0.01*randn(emb_width,emb_width*2) 
    b_e = 0.01*randn(emb_width) 
    #W_d = 0.01*randn(emb_width*2,emb_width)
    W_d = pinv(W_e) #Cheat (Actually why can't I always do this to initialize?);
    b_d = 0.01*randn(emb_width*2)
    
    RAE(L,word_index, indexed_words, W_e, b_e, W_d, b_d)
end


function get_word_index(rae::RAE, input::String, show_warn=true)
    if haskey(rae.word_index, input)
        ii = rae.word_index[input]
    elseif haskey(rae.word_index, lowercase(input))
        ii = rae.word_index[lowercase(input)]
    else
        ii = rae.word_index["*UNKNOWN*"]
        if show_warn
            println("$input not found. Defaulting.")
        end
    end
    ii
end


function eval_word_embedding(rae::RAE, input::String, show_warn=true)
    k=get_word_index(rae, input, show_warn)
    rae.L[:,k]
end

function eval_word_embeddings(rae::RAE, inputs::Words, show_warn=false)
    ks = @pipe inputs |> map(ii -> get_word_index(rae,ii, show_warn), _)
    rae.L[:,ks]
end




function eval_merges(rae::RAE, c_is::Embeddings, c_js::Embeddings)
    @assert size(c_is)==size(c_js)

    tanh(rae.W_e*[c_is;c_js].+rae.b_e)
end

function reconstruct(rae::RAE, pp::Embedding)
    ĉ_ijs = tanh(rae.W_d*pp+rae.b_d)
    ĉ_is = ĉ_ijs[1:end/2]
    ĉ_js = ĉ_ijs[end/2+1:end]
    ĉ_is, ĉ_js
end



function unfold_merges(rae::RAE, pps::Embeddings)
    ĉ_ijs = tanh(rae.W_d*pps .+ rae.b_d)
end

unfold_merges (generic function with 1 method)

In [25]:


function eval_scores(rae::RAE, c_is::Embeddings, c_js::Embeddings,
                     pps=eval_merges(rae, c_is, c_js)::Embeddings,
                     ĉ_ijs = unfold_merges(rae,pps)::Embeddings)
    c_ijs = [c_is;c_js]
    
    1/2*sum((c_ijs-ĉ_ijs).^2,1)
end
#A better scoring function is given in SorcherRAE (eaquation 6)


function eval_scores_gradient(rae::RAE, 
                              c_ijs::Embeddings,
                              pps::Embeddings,
                              ĉ_ijs::Embeddings)
    #http://neuralnetworksanddeeplearning.com/chap2.html
    N = size(c_ijs,2)
    
    da = (ĉ_ijs - c_ijs)
    dz_d = (1-ĉ_ijs.^2)
    δ_d = da.*dz_d
    
    #loop below if deep    
    dz_e = (1-pps.^2)
    δ_e = (rae.W_d'*δ_d).*dz_e 
    
    
    ∇W_d = 1/N*δ_d*pps'
    ∇b_d = 1/N*sum(δ_d,2)[:]
    ∇W_e = 1/N*δ_e*c_ijs'
    ∇b_e = 1/N*sum(δ_e,2)[:]
    
    
    ∇W_e, ∇b_e, ∇W_d, ∇b_d
end



eval_scores_gradient (generic function with 1 method)

In [None]:
function bptt(rae::RAE, tree::(String,String), 
                        c_ijs::Embeddings,
                        pps::Embeddings,
                        ĉ_ijs::Embeddings)
    ĉ_is, ĉ_js = reconstruct(rae, pp)
    ĉ_ijs=[ĉ_is ĉ_js]
end


function unfold(rae::RAE, tree::(Any,String), pp::Embedding)
    p̂_is, ĉ_js = reconstruct(rae, pp)
    ĉ_is = unfold(rae, tree[1], p̂_is)
    [ĉ_is ĉ_js]
end

function unfold(rae::RAE, tree::(String,Any), pp::Embedding)
    ĉ_is, p̂_js = reconstruct(rae, pp)
    ĉ_js = unfold(rae, tree[2], p̂_js)
    [ĉ_is ĉ_js]
    
end

function unfold(rae::RAE, tree::(Any,Any), pp::Embedding)
    p̂_is, p̂_js = reconstruct(rae, pp)
    ĉ_is = unfold(rae, tree[1], p̂_is)
    ĉ_js = unfold(rae, tree[2], p̂_js)
    [ĉ_is ĉ_js]
end


In [26]:
function eval_to_tree(rr::RAE,sentence::String)
    eval_to_tree(rr, tokenize(sentence))
end

function eval_to_tree(rr::RAE, sentence::Words)
    tree = tuple(sentence...)
    cs = eval_word_embeddings(rr, sentence)
    score_total = 0.0
    while(size(cs,2)>1)
        c_is = cs[:, 1:end-1]
        c_js = cs[:, 2:end]
        
        pps = eval_merges(rr, c_is, c_js)
        scores = eval_scores(rr, c_is, c_js, pps)
        im = indmax(scores)
        
        score_total+=scores[im]
         
        cs = [cs[:,1:im-1] pps[:,im] cs[:,im+2:end]]
        tree = tuple(tree[1:im-1]..., (tree[im], tree[im+1]), tree[im+2:end]...)
    end
    tree = tree[1] #The final step creates a tuple containing one element, as first and last parts are empty
    embedding = cs[:]
    tree, embedding, score_total
end


eval_to_tree (generic function with 2 methods)

In [27]:
rr = RAE(LL,word_indexes,indexed_words);

sent = "the boy destroyed the house"
sent_toks = tokenize(sent)

tree, pp, score_total = eval_to_tree(rr,sent_toks)

(((("the","boy"),"destroyed"),("the","house")),[0.0120591,-0.00703663,0.0107588,0.0230025,0.00931222,0.00811373,-0.00149554,-0.0184021,0.0270022,-0.00141471  …  -0.000454066,0.0127258,0.00559064,0.0118492,-0.000726663,0.00856643,-0.0152987,0.00131534,0.00877025,0.00671089],6.99439944213192)

In [21]:
using Iterators
@pyimport nltk.corpus as nltk_corpus
n_training = 800
#training_sents = @pipe nltk_corpus.brown[:sents]() |> take(_,n_training)  |> collect |> convert(Vector{Vector{String}},_);
training_sents = @pipe nltk_corpus.brown[:sents]() |> filter(s->length(s)==2, _) |> take(_,n_training)  |> collect |> convert(Vector{Vector{String}},_);


In [22]:
rae_outer = RAE(LL,word_indexes,indexed_words);
ts = hcat(training_sents...)'
c_is= @pipe ts[:,1] |> eval_word_embeddings(rae_outer,_) 
c_js= @pipe ts[:,2] |> eval_word_embeddings(rae_outer,_);
size(c_js)

(50,730)

In [29]:
using Optim #https://github.com/JuliaOpt/Optim.jl

rae_outer = RAE(LL,word_indexes,indexed_words);

function unpack!(rae::RAE, θ::Vector)
    W_e_len = length(rae.W_e)
    b_e_len = length(rae.b_e)
    W_d_len = length(rae.W_d)
    b_d_len = length(rae.b_d)
    W_e_shape = size(rae.W_e)
    W_d_shape = size(rae.W_d)
    
    rae.W_e = reshape(θ[1: W_e_len],W_e_shape)
    rae.b_e = θ[W_e_len+1: W_e_len+b_e_len]
    rae.W_d = reshape(θ[W_e_len+b_e_len+1: W_e_len+b_e_len+W_d_len],W_d_shape)
    rae.b_d = θ[W_e_len+b_e_len+W_d_len+1: end]
    
    rae
end

function pack(rae::RAE)
    [rae.W_e[:],rae.b_e, rae.W_d[:],rae.b_d[:]] 
end

function pack(∇W_e::Matrix{Float64}, ∇b_e::Vector{Float64}, ∇W_d::Matrix{Float64}, ∇b_d::Vector{Float64})
    [∇W_e[:], ∇b_e, ∇W_d[:], ∇b_d] 
end

#--------------------------------------------------------

function loss!(θ::Vector)  
    rae = unpack!(rae_outer, θ)
    pps=eval_merges(rae, c_is, c_js) 
    err=eval_scores(rae, c_is, c_js,pps)  # score all merges
    
    #err =@pipe training_sents |> map( ss-> eval_to_tree(rae, ss)[3], _) |> mean
    
    err
end

function loss_grad!(θ::Vector, storage::Vector)   
    rae = unpack!(rae_outer, θ)
    c_ijs=[c_is;c_js]
    pps=eval_merges(rae, c_is, c_js)
    ĉ_ijs = unfold_merges(rae, pps)
    ∇s = eval_scores_gradient(rae,ĉ_ijs,pps,c_ijs)
    
    storage[:] = pack(∇s...)[:]
end

function loss_and_loss_grad!(θ::Vector, storage::Vector)   
    rae = unpack!(rae_outer, θ)
    c_ijs=[c_is;c_js]
    pps=eval_merges(rae, c_is, c_js) 
    ĉ_ijs = unfold_merges(rae, pps)
    err=eval_scores(rae, c_is, c_js, pps, ĉ_ijs) |> mean
    
    ∇s = eval_scores_gradient(rae,c_ijs,pps,ĉ_ijs)
    storage[:] = pack(∇s...)[:]
    err
end

f=DifferentiableFunction(loss!,loss_grad!,loss_and_loss_grad!)
#Must provide Graident as finite difference requires ~length(θ) calls to f
res = optimize(f, pack(rae_outer), method=:l_bfgs, show_trace = true,iterations = 1000)
rae_outer = unpack!(rae_outer, res.minimum);
print("---------------------------")

Iter     Function value   Gradient norm 
     0     3.643091e+00     6.565683e+00
     1     3.512624e+00     3.145956e+00
     2     3.432414e+00     2.513818e+00
     3     3.381413e+00     2.282457e+00
     4     3.358298e+00     2.415764e+00
     5     3.344877e+00     1.971613e+00
     6     3.330361e+00     2.103694e+00
     7     3.314535e+00     2.293344e+00
     8     3.304756e+00     2.484443e+00
     9     3.288787e+00     2.099002e+00
    10     3.268598e+00     1.802348e+00
    11     3.256054e+00     2.124679e+00
    12     3.227228e+00     6.369441e-01
    13     3.209265e+00     8.788510e-01
    14     3.194077e+00     9.373089e-01
    15     3.174564e+00     9.758307e-01
    16     3.148718e+00     1.306796e+00
    17     3.129229e+00     9.769877e-01
    18     3.095154e+00     1.217260e+00
    19     3.048212e+00     1.129759e+00
    20     2.960644e+00     2.038510e+00
    21     2.872059e+00     2.007549e+00
    22     2.767416e+00     1.283971e+00
    23     2.685

https://github.com/JuliaLang/julia/blob/master/doc/manual/profile.rst Actual instructions on profiling



In [None]:
Profile.clear()
@profile f(pack(rae_outer))


In [None]:
using ProfileView
ProfileView.view()

In [32]:
#tree data in tree is not use, other than it's structure.
#((("the","house"),("destroyed",("the","boy")))  is equivalent to ((("",""),("",("",""))) 



function unfold(rae::RAE, tree::(String,String), pp::Embedding)
    ĉ_is, ĉ_js = reconstruct(rae, pp)
    [ĉ_is ĉ_js]
end


function unfold(rae::RAE, tree::(Any,String), pp::Embedding)
    p̂_is, ĉ_js = reconstruct(rae, pp)
    ĉ_is = unfold(rae, tree[1], p̂_is)
    [ĉ_is ĉ_js]
end

function unfold(rae::RAE, tree::(String,Any), pp::Embedding)
    ĉ_is, p̂_js = reconstruct(rae, pp)
    ĉ_js = unfold(rae, tree[2], p̂_js)
    [ĉ_is ĉ_js]
    
end

function unfold(rae::RAE, tree::(Any,Any), pp::Embedding)
    p̂_is, p̂_js = reconstruct(rae, pp)
    ĉ_is = unfold(rae, tree[1], p̂_is)
    ĉ_js = unfold(rae, tree[2], p̂_js)
    [ĉ_is ĉ_js]
end

    

unfold (generic function with 4 methods)

In [33]:
function cosine_dist(a,b)
    (a⋅b)/(norm(a)*norm(b))
end

function neighbour_dists(cc::Vector{Float64}, globe::Matrix{Float64})
    [cosine_dist(cc, globe[:,ii]) for ii in 1:size(globe,2)]
end


function show_best(rae::RAE,ĉ::Embedding, nbest=20)
    candidates=neighbour_dists(ĉ,rae.L)   
    best_cands = [ (findfirst(candidates,score), score)
                    for score in select(candidates,1:nbest, rev=true)[1:nbest]]
    vcat([[rae.indexed_words[ii] score] for (ii,score) in best_cands]...)
end

function show_bests(rae::RAE,ĉs::Embeddings, nbest=20)
    hcat([show_best(rae,ĉs[:,ii],nbest) for ii in 1:size(ĉs,2)]...)
end


show_bests (generic function with 2 methods)

In [41]:
tree, pp, score_total = eval_to_tree(rae_outer,"easy holdings")
ĉs = unfold(rae_outer,tree,pp)

show_bests(rae_outer, ĉs)


20x4 Array{Any,2}:
 "easy"           0.816017  "holdings"       0.82285 
 "patient"        0.700192  "reserves"       0.772096
 "OK"             0.671654  "portfolios"     0.770776
 "simple"         0.661516  "borrowings"     0.764627
 "appropriate"    0.661285  "offerings"      0.761199
 "impossible"     0.66097   "expenses"       0.759594
 "silly"          0.646397  "expenditure"    0.759503
 "unpleasant"     0.645093  "dividends"      0.752636
 "whole"          0.639919  "surpluses"      0.750342
 "conclusive"     0.639644  "registrations"  0.748892
 "legitimate"     0.634043  "repos"          0.748398
 "inappropriate"  0.633703  "receipts"       0.742331
 "reasonable"     0.628542  "credits"        0.732735
 "true"           0.627107  "certificates"   0.730689
 "consistent"     0.627092  "expenditures"   0.726706
 "objective"      0.625305  "overheads"      0.723999
 "effective"      0.622179  "invesment"      0.721462
 "free-spending"  0.620832  "profits"        0.718757
 "differe

In [None]:
function depth_inc(ele::(Int,String))
    (ele[1]+1,ele[2])
end

function unfold_struct(tree::(Any,Any))
    left_tree = unfold_struct(tree[1]) 
    left = @pipe left_tree |> map(depth_inc,_)
    right_tree = unfold_struct(tree[2]) 
    right = @pipe right_tree |> map(depth_inc,_)
    [left, right, (0,"")]
end

function unfold_struct(tree::(Any,String))
    left_tree = unfold_struct(tree[1]) 
    left = @pipe left_tree |> map(depth_inc,_)
    [left, (0,tree[2]), (0,"")]
end
function unfold_struct(tree::(String,Any))
    right_tree = unfold_struct(tree[2]) 
    right = @pipe right_tree |> map(depth_inc,_)
    [(0,tree[1]),right, (0,"")]
end
function unfold_struct(tree::(String,String))
    [(0,tree[1]), (0, tree[2]), (0,"")]
end

function print_tree(tree::(Any,Any))
    
    for (depth,word ) in unfold_struct(tree)
        println("\t"^depth, word)
    end
end