In [1]:
using DataStructures
using Base.Collections
using Iterators
using Pipe
# Debug helper: print the type of an array followed by its dimensions.
pz(x :: AbstractArray) = println(typeof(x), ": ", size(x))
# Debug macro: `@printval expr` prints the source text of `expr`, then " = ",
# then the value of `expr`.
macro printval(ee)
    # `string` already renders the expression as text; no @sprintf round trip is needed.
    ee_expr = string(ee)
    # The whole call is escaped so that `ee` is evaluated in the caller's scope.
    esc(:(println($ee_expr," = ", $ee)))
end

# Debug macro: `@pz expr` prints the source text of `expr`, then the type and the
# size of its value, separated by tabs.
macro pz(ee)
    # `string` already renders the expression as text; no @sprintf round trip is needed.
    ee_expr = string(ee)
    # The whole call is escaped so that `ee` is evaluated in the caller's scope.
    esc(:(println($ee_expr,"\t\t",typeof($ee), "\t", size($ee))))
end

In [2]:
using RecursiveAutoencoders

In [3]:
# Load the embedding matrix together with its word->index and index->word lookups,
# then print the matrix size and the types of the two lookups.
include("load_embeddings.jl")
LL, word_indexes, indexed_words = load_embeddings("embeddings-scaled.EMBEDDING_SIZE=50.txt");
println(size(LL))
println(typeof(word_indexes))
println(typeof(indexed_words))

(50,268810)
Dict{String,Int64}
Array{String,1}


In [4]:
# Read the serialized training trees back from disk; `open` closes the file afterwards.
# NOTE(review): `deserialize` should only be pointed at trusted files.
training_trees = open(deserialize, "training_sents.jsz", "r");

In [5]:
# Tag types recording which child slot an unfolding node fills under its parent.
# They hold no data and are used as type parameters and dispatch tags.
abstract Side
# Tag passed by `unfold` for the node rebuilt from a parent's left child.
immutable Left<:Side
end

# Tag passed by `unfold` for the node rebuilt from a parent's right child.
immutable Right<:Side
end

# Tag passed by `unfold` for the root of the unfolding, which has no sibling slot.
immutable NoSide<:Side
end

# Record of one merge made by `fold`: the merged embedding and the two children
# that produced it. A child is either a nested FoldData or a plain embedding.
immutable FoldData
    p_out::Embedding                  # result of eval_merge for this node
    left::Union(FoldData,Embedding)   # first child (tree[1])
    right::Union(FoldData,Embedding)  # second child (tree[2])
end

# Record of one reconstruction step made by `unfold` at an internal node.
# `T` tags which child slot of its parent this node occupies.
immutable UnfoldData{T<:Side}
    p_in::Embedding                     # embedding passed to `reconstruct` at this node
    parent::Union(FoldData,UnfoldData)  # node above; at the root this is the FoldData being unfolded
    ĉ_i::Embedding                      # first output of `reconstruct` (left child estimate)
    ĉ_j::Embedding                      # second output of `reconstruct` (right child estimate)
    depth::Int64                        # 0 at the root, incremented by one per level
end

# Record of one leaf reached by `unfold`: a reconstructed embedding paired with the
# embedding found at the same position in the folded tree.
# `T` tags which child slot of its parent this leaf occupies.
immutable UnfoldLeaf{T<:Side}
    ĉ::Embedding        # reconstruction produced by the parent node
    parent::UnfoldData  # unfolding node that produced ĉ
    c::Embedding        # embedding stored at this position of the folded tree
    depth::Int64        # depth of this leaf in the unfolding
end



In [6]:
# Return an instance of the side tag `T` held in the node's type parameter,
# so callers can dispatch on Left / Right / NoSide.
get_side{T}(::Union(UnfoldLeaf{T}, UnfoldData{T})) = T()

get_side (generic function with 1 method)

In [7]:
# `emb` gives the embedding that represents a fold-tree child:
# the merged output for an internal node, or the embedding itself for a leaf.
emb(data::FoldData) = data.p_out
emb(data::Embedding) = data

# Encode a binary tree bottom-up. Each child is turned into an embedding (for a word
# or ready-made embedding) or a FoldData (for a subtree); the two children are then
# merged with `eval_merge`. Returns the FoldData for the root of `tree`.
function fold(rae::RAE, tree::(Any,Any))
    # Dispatch on the kind of child: word, ready-made embedding, or nested subtree.
    resolve(word::String) = eval_word_embedding(rae, word, false)
    resolve(e::Embedding) = e
    resolve(subtree::Any) = fold(rae, subtree)

    lhs = resolve(tree[1])
    rhs = resolve(tree[2])
    merged = eval_merge(rae, emb(lhs), emb(rhs))
    return FoldData(merged, lhs, rhs)
end

fold (generic function with 1 method)

In [8]:
# Leaf case of the unfolding: pair the reconstruction `ĉ` with the embedding `c`
# from the folded tree. `rae` is unused here; `T` is the side tag for this leaf.
unfold{T}(rae::RAE, c::Embedding, ĉ::Embedding, parent, ::Type{T}, depth) =
    UnfoldLeaf{T}(ĉ, parent, c, depth)


# Internal-node case of the unfolding: reconstruct two children from `p_in`, record
# the step, and recurse along the shape of the folded node `act`.
# `T` is used only as the side tag of the UnfoldData created here.
# Returns the left subtree's result followed by the right's, joined with vcat.
function unfold{T}(rae::RAE, act::FoldData, p_in::Embedding, parent, ::Type{T}, depth::Int)
    recon_left, recon_right = reconstruct(rae, p_in)
    here = UnfoldData{T}(p_in, parent, recon_left, recon_right, depth)

    below = depth + 1
    left_leaves = unfold(rae, act.left, recon_left, here, Left, below)
    right_leaves = unfold(rae, act.right, recon_right, here, Right, below)
    return vcat(left_leaves, right_leaves)
end

# Entry point: unfold from the root of a folded tree. The root is decoded from its
# own merged embedding, uses the FoldData as its parent, has no side, and depth 0.
unfold(rae::RAE, act::FoldData) = unfold(rae, act, act.p_out, act, NoSide, 0)

unfold (generic function with 3 methods)

In [9]:
# Structural checks on fold/unfold: leaves rebuilt from the same node must share
# a parent record.
function test()
    model = RAE(LL,word_indexes,indexed_words);

    # Two-word tree: both leaves hang off the root.
    folded = fold(model, ("killer", "cows"))
    b = unfold(model, folded)
    @assert b[1].parent==b[2].parent

    # Three-word tree: the root is the parent of leaf 1 and the grandparent of leaves 2 and 3.
    folded = fold(model, ("the",("killer", "cows")))
    b = unfold(model, folded)
    @assert b[1].parent==b[2].parent.parent==b[3].parent.parent
end
test()

In [10]:
# Backpropagated error for a tanh layer below a weight matrix.
#   a        activation of this layer, a = tanh(z)
#   δ_above  error of the layer above
#   W        weights mapping this layer to the layer above
function δ(a::Embedding, δ_above::Vector{Float64}, W::Matrix{Float64})
    pulled_back = W' * δ_above
    tanh_slope = 1 .- a.^2   # derivative of tanh(z), written in terms of a
    return pulled_back .* tanh_slope
end

# Error at the output layer for the squared reconstruction loss.
#   ĉ_ij  reconstructed embedding (a tanh activation)
#   c_ij  target embedding
# This equals the hidden-layer rule applied with an identity weight matrix.
function δ(ĉ_ij::Embedding,c_ij::Embedding)
    dz = 1 .- ĉ_ij.^2          # derivative of tanh, written in terms of the activation
    δ_above = -(c_ij-ĉ_ij)     # derivative of 0.5*(c-ĉ)^2 with respect to ĉ
    δ_above.*dz
end


δ (generic function with 2 methods)

In [11]:
# Place a child's error vector into the half of a double-length vector that matches
# its side, filling the other half with zeros. Explicit `;` is used so the result is
# always one flat vector.

# Left child: values go in the first half.
function sidepad(d::Vector{Float64}, ::Left)
    padding=zeros(size(d))
    [d; padding]
end

# Right child: values go in the second half.
function sidepad(d::Vector{Float64}, ::Right)
    padding=zeros(size(d))
    [padding; d]
end

# Root: there is no sibling, so nothing is padded.
function sidepad(d::Vector{Float64}, ::NoSide)
    d
end

# Inverse of `sidepad`: pull one side's half out of a double-length vector.
# The midpoint uses integer division so the index range is made of integers.

# Left child: first half.
function unsidepad(d::Vector{Float64}, ::Left)
    d[1:div(end, 2)]
end

# Right child: second half.
function unsidepad(d::Vector{Float64}, ::Right)
    d[div(end, 2)+1:end]
end

# Root: the vector was never padded.
function unsidepad(d::Vector{Float64}, ::NoSide)
    d
end

unsidepad (generic function with 3 methods)

In [12]:
# Start unfolding backpropagation from the leaves. Each leaf's output-layer error is
# padded into its side's half and summed into the entry for its parent node; the
# per-parent sums are then passed to the Dict method of UBPTS.
function UBPTS(rae::RAE, nodes::Vector{UnfoldLeaf} )
    parent_deltas = Dict{UnfoldData, Vector{Float64}}()

    for leaf in nodes
        padded = sidepad(δ(leaf.ĉ, leaf.c), get_side(leaf))
        owner = leaf.parent
        # First contribution is stored as-is; later ones are added to it.
        parent_deltas[owner] = haskey(parent_deltas, owner) ? parent_deltas[owner] + padded : padded
    end

    return UBPTS(rae, parent_deltas)
end

# Backpropagate through the unfolding (decoder) nodes.
#   parent_deltas  error accumulated so far for each unfolding node, keyed by node;
#                  this dictionary is extended and updated in place
# Returns (δ_above_fold, ΔW_d, Δb_d): the error handed to the FoldData at the top of
# the unfolding, and the accumulated updates for rae.W_d and rae.b_d.
function UBPTS(rae::RAE, parent_deltas::Dict{UnfoldData,Vector{Float64}})
    foldnode = nothing  # set by pend!(::FoldData, ...) below; not read again in this function
    δ_above_fold = 0    # scalar start value; becomes a vector on the first `+=`
    
    # Work queue keyed by node with priority node.depth, using reverse ordering.
    # NOTE(review): presumably this dequeues the deepest node first, so every child
    # contributes before its parent is processed - confirm against PriorityQueue docs.
    pending_nodes = PriorityQueue{UnfoldData, Int64}(Base.Order.Reverse)
    enqueue!(node::UnfoldData) = pending_nodes[node] = node.depth #Local helper: queue a node with priority node.depth
    map(enqueue!, keys(parent_deltas)) #Add all that were passed, as none have been processed
    
    # Hand an error vector up to an unfolding node, queueing the node on first sight.
    function pend!(parent_node::UnfoldData, δ_node::Vector{Float64})
        if !haskey(parent_deltas,parent_node)
            enqueue!(parent_node) #no delta recorded yet, so it has not been enqueued either
            parent_deltas[parent_node]=δ_node
        else
            parent_deltas[parent_node]+=δ_node
        end
    end
        
    # Hand an error vector up to the fold tree: this is the top of the unfolding.
    function pend!(node::FoldData, δ_node::Vector{Float64})
        foldnode = node
        δ_above_fold+=δ_node
    end

    ΔW_d=0 #will broadcast
    Δb_d=0 
    while !isempty(pending_nodes)
        node = dequeue!(pending_nodes)
        δ_above =  parent_deltas[node]
        #Note: node.p_in= suitable half of node.parent.ĉ_i or node.parent.ĉ_j
        #      The line below takes a lot of thinking to be sure it is right
        δ_node = δ(node.p_in, δ_above, rae.W_d)

        # Place this node's error into its side's half of the parent's vector.
        δ_padded = sidepad(δ_node, get_side(node))
        
        
        # Accumulate this node's contribution to the decoder weight and bias updates.
        ΔW_d += δ_above*node.p_in'
        Δb_d += δ_above

        
        pend!(node.parent,δ_padded)
    end

    (δ_above_fold, ΔW_d, Δb_d)
end


UBPTS (generic function with 2 methods)

In [13]:
# Backpropagate through the folding (encoder) tree below `node`.
#   δ_above  error arriving at this node's merged embedding
# Returns (ΔW_e, Δb_e): the updates for rae.W_e and rae.b_e summed over this node
# and everything below it.
function UBPTS(rae::RAE, node::FoldData, δ_above::Vector{Float64})
    c_i=emb(node.left)
    c_j=emb(node.right)
    a= [c_i; c_j]   # input to this merge: both children stacked

    δ_node =  δ(a, δ_above, rae.W_e)

    # Split the stacked error back into each child's half. Integer division keeps
    # the index ranges made of integers.
    half = div(length(δ_node), 2)
    δ_left = δ_node[1:half]
    δ_right = δ_node[half+1:end]

    ΔW_e=δ_above*a'
    Δb_e=δ_above

    # Only the first two returned values are used from each recursive call.
    ΔW_e_left, Δb_e_left = UBPTS(rae, node.left, δ_left)
    ΔW_e_right, Δb_e_right = UBPTS(rae, node.right, δ_right)
    (ΔW_e+ΔW_e_left+ΔW_e_right, Δb_e+Δb_e_left+Δb_e_right)
end

# Leaf of the fold tree: there are no encoder weights here, so both updates are zero.
# Returns a (ΔW_e, Δb_e) pair, the same shape as the FoldData method, which
# destructures exactly two values from this call.
function UBPTS(rae::RAE, node::Embedding, δ_above::Vector{Float64})
    (0, 0) # Nothing to learn here (at least until we start learning rae.L)
end


UBPTS (generic function with 4 methods)

In [14]:
# Shape check: run both backpropagation passes on one example tree and print each
# parameter next to its update so their types and sizes can be compared by eye.
function test()
    rr = RAE(LL,word_indexes,indexed_words);
    folded = fold(rr, ("the",("bad",("killer", "cows"))))
    leaves = unfold(rr, folded)

    # Decoder pass
    δ_top, ΔW_d, Δb_d = UBPTS(rr, leaves)
    println(repeat("-", 54))
    @pz rr.W_d
    @pz ΔW_d
    @pz rr.b_d
    @pz Δb_d

    # Encoder pass, driven by the error handed over from the decoder pass
    ΔW_e, Δb_e = UBPTS(rr, folded, δ_top)
    println(repeat("+", 54))
    @pz rr.W_e
    @pz ΔW_e
    @pz rr.b_e
    @pz Δb_e

end
test()


------------------------------------------------------
rr.W_d		Array{Float64,2}	(100,50)
ΔW_d		Array{Float64,2}	(100,50)
rr.b_d		Array{Float64,1}	(100,)
Δb_d		Array{Float64,1}	(100,)
++++++++++++++++++++++++++++++++++++++++++++++++++++++
rr.W_e		Array{Float64,2}	(50,100)
ΔW_e		Array{Float64,2}	(50,100)
rr.b_e		Array{Float64,1}	(50,)
Δb_e		Array{Float64,1}	(50,)


In [15]:
# Extend the package's eval_word_embeddings to binary trees: look up every word in
# the tree and join the results column-wise, left subtree first.
function RecursiveAutoencoders.eval_word_embeddings(rae::RAE, tree::(Any,Any))
    lookup(word::String) = eval_word_embedding(rae, word, false)
    lookup(subtree::Any) = eval_word_embeddings(rae, subtree)

    return hcat(lookup(tree[1]), lookup(tree[2]))
end

eval_word_embeddings (generic function with 3 methods)

In [16]:
# Reconstruction loss: half the squared difference between each leaf's embedding and
# its reconstruction, summed over all components and all leaves.
function loss(unfold_leaves::Vector{UnfoldLeaf})
    per_leaf = map(leaf -> sum(0.5*(leaf.c-leaf.ĉ).^2), unfold_leaves)
    return sum(per_leaf)
end

# Reconstruction loss of one tree: fold it, unfold the result, score the leaves.
loss(rae::RAE, tree::(Any,Any)) = loss(unfold(rae, fold(rae, tree)))


# Loss and parameter gradients for one tree.
# Returns ((∇W_e, ∇b_e, ∇W_d, ∇b_d), err).
function loss_and_loss_grad(rae::RAE, tree::(Any,Any))
    folded = fold(rae, tree)
    leaves = unfold(rae, folded)
    err = loss(leaves)

    # Decoder pass first; it yields the error that starts the encoder pass.
    δ_top, ∇W_d, ∇b_d = UBPTS(rae, leaves)
    ∇W_e, ∇b_e = UBPTS(rae, folded, δ_top)

    return ((∇W_e, ∇b_e, ∇W_d, ∇b_d), err)
end

loss_and_loss_grad (generic function with 1 method)

In [17]:
# Overwrite the parameters of `rae` from the flat vector θ, laid out as
# [W_e; b_e; W_d; b_d] with matrices in column-major order. Sizes come from the
# current fields of `rae`. Every field receives a fresh copy of its slice of θ.
# b_d takes whatever remains after W_d. Returns `rae`.
@everywhere function unpack!(rae::RAE, θ::Vector)
    shape_We = size(rae.W_e)
    shape_Wd = size(rae.W_d)

    # Running end offsets of the first three segments.
    stop_We = length(rae.W_e)
    stop_be = stop_We + length(rae.b_e)
    stop_Wd = stop_be + length(rae.W_d)

    rae.W_e = reshape(θ[1:stop_We], shape_We)
    rae.b_e = θ[stop_We+1:stop_be]
    rae.W_d = reshape(θ[stop_be+1:stop_Wd], shape_Wd)
    rae.b_d = θ[stop_Wd+1:end]

    rae
end

# Flatten the four parameter arrays of `rae` into a single vector.
@everywhere pack(rae::RAE) = pack(rae.W_e, rae.b_e, rae.W_d, rae.b_d)

# Flatten two matrix/vector pairs into one vector laid out as [W_e; b_e; W_d; b_d],
# matrices in column-major order. Explicit `;` is used so the result is always one
# flat Vector{Float64}.
@everywhere function pack(∇W_e::Matrix{Float64}, ∇b_e::Vector{Float64}, ∇W_d::Matrix{Float64}, ∇b_d::Vector{Float64})
    [∇W_e[:]; ∇b_e; ∇W_d[:]; ∇b_d]
end


In [19]:
# Backpropagated gradients for one tree as a tuple (∇W_e, ∇b_e, ∇W_d, ∇b_d);
# the loss value is discarded.
function analytic_grad(rae::RAE, tree::(Any,Any))
    grads = loss_and_loss_grad(rae, tree)[1]
    return tuple(grads...)
end
 
# Central-difference estimate of the loss gradient, for checking analytic_grad.
#   ϵ  perturbation size applied to one parameter at a time
# Returns (∇W_e, ∇b_e, ∇W_d, ∇b_d) shaped like the parameters of `rae`.
# `rae` itself is not modified; all work is done on a deep copy.
function numeric_grad(rae::RAE, tree::(Any,Any), ϵ=10.0^-4)
    rae_inner = deepcopy(rae)
    θ = pack(rae_inner)
    Δθ = zeros(size(θ))
    for ii in 1:length(θ)
        # Perturb a single entry in place instead of building a full-length offset
        # vector and two shifted copies of θ for every parameter. This is safe
        # because unpack! copies slices of θ into the model.
        θᵢ = θ[ii]

        θ[ii] = θᵢ + ϵ
        unpack!(rae_inner,θ)
        Jⁱ⁺ = loss(rae_inner, tree)

        θ[ii] = θᵢ - ϵ
        unpack!(rae_inner,θ)
        Jⁱ⁻ = loss(rae_inner, tree)

        θ[ii] = θᵢ  # restore before moving on to the next parameter
        Δθ[ii] = (Jⁱ⁺-Jⁱ⁻)/(2.0*ϵ)
    end
    # Reuse unpack! to reshape the flat gradient into parameter-shaped arrays.
    unpack!(rae_inner,Δθ)

    (rae_inner.W_e, rae_inner.b_e, rae_inner.W_d, rae_inner.b_d)
end

numeric_grad (generic function with 2 methods)

In [18]:
# Gradient-check setup: one example tree and a fresh model.
eg_tree = ("the",("dangerous",("killer", "cows")))
rae_outer = RAE(LL,word_indexes,indexed_words);
# NOTE(review): the numeric gradient below is disabled, yet the comparison cell that
# follows reads nW_e. Re-enable this line (after numeric_grad has been defined)
# before running that comparison.
#nW_e, nb_e, nW_d, nb_d = numeric_grad(rae_outer, eg_tree, 10.0^-7);
aW_e, ab_e, aW_d, ab_d = analytic_grad(rae_outer, eg_tree);



LoadError: numeric_grad not defined
while loading In[18], in expression starting on line 3

In [None]:


# Compare the numeric and analytic gradients of W_e, flattened to vectors.
# NOTE(review): nW_e is only defined when the numeric_grad call in the setup cell is
# enabled; only the W_e gradient is compared here.
ngs = nW_e[:]
ags = aW_e[:]

# Summary of the absolute differences: mean, worst case, and Euclidean norm.
@printval mean(abs(ngs-ags))
@printval maximum(abs(ngs-ags))
@printval norm(ngs-ags)
println("-"^54)

# Per-element listing: numeric value, analytic value, absolute difference.
for (ng,ag) in zip(ngs, ags)
    println(ng,"\t", ag, "\t", abs(ng-ag))
end



In [22]:
using Optim #https://github.com/JuliaOpt/Optim.jl


# Placeholder for the value-only callback: always raises, since only the combined
# value-and-gradient callback is implemented.
loss!(θ::Vector) = error("loss! not defined")

# Placeholder for the gradient-only callback: always raises, since only the combined
# value-and-gradient callback is implemented.
loss_grad!(θ::Vector, storage::Vector) = error("loss_grad not defined")

# Global model instance; loss_and_loss_grad! overwrites its parameters from θ on every call.
rae_outer = RAE(LL,word_indexes,indexed_words);

# Combined objective callback for the optimizer.
#   θ     flat parameter vector; loaded into the global `rae_outer`
#   grad  output buffer, overwritten with the mean gradient over `training_trees`
# Returns the mean loss over `training_trees`.
function loss_and_loss_grad!(θ::Vector, grad::Vector)
    unpack!(rae_outer, θ)

    # Per-tree result as one flat vector: packed gradient followed by the loss.
    # Explicit `;` makes sure the pieces are concatenated into a single vector.
    function loss_and_loss_grad_single(tree::(Any,Any))
        Δs, err = loss_and_loss_grad(rae_outer, tree)
        [pack(Δs...); err]
    end

    ret = map(loss_and_loss_grad_single, training_trees)|> sum
    n_trees = length(training_trees)

    # `grad` is fully overwritten here, so no separate zero-fill is needed.
    grad[:] = ret[1:end-1]/n_trees
    ret[end]/n_trees
end
# Bundle the three callbacks for Optim. The value-only and gradient-only callbacks
# are placeholders that raise, so only the combined callback can actually be used.
f=DifferentiableFunction(loss!,loss_grad!,loss_and_loss_grad!)

DifferentiableFunction(loss!,loss_grad!,loss_and_loss_grad!)

In [None]:
# Run L-BFGS from the current parameters of rae_outer, capped at 2 iterations,
# printing and storing the trace.
res = optimize(f, pack(rae_outer), method=:l_bfgs, show_trace = true, store_trace = true, iterations = 2);

Iter     Function value   Gradient norm 
 

In [None]:

# Print selected fields of the optimization result, one labelled line per field.
@printval res.f_calls 
@printval res.g_calls 
@printval res.x_converged 
@printval res.iterations
@printval res.f_minimum
@printval res.gr_converged
@printval res.trace


In [None]:
# Cosine of the angle between `a` and `b`: their dot product divided by the product
# of their norms. Despite the name this is a similarity, so larger means closer.
cosine_dist(a,b) = dot(a, b)/(norm(a)*norm(b))

# Score `cc` against every column of `globe` with cosine_dist, one score per column.
function neighbour_dists(cc::Vector{Float64}, globe::Matrix{Float64})
    ncols = size(globe, 2)
    return [cosine_dist(cc, globe[:, col]) for col in 1:ncols]
end


# List the `nbest` vocabulary words whose embeddings score highest against `ĉ`.
# Returns a two-column array with one row per word: the word and its score rounded
# to two decimals, best first.
function show_best(rae::RAE,ĉ::Embedding, nbest=20)
    candidates=neighbour_dists(ĉ,rae.L)
    # Never ask for more rows than there are words.
    nshow = min(nbest, length(candidates))
    # Rank by index rather than looking each score up again: tied scores keep
    # distinct indices, and the candidate list is not rescanned per result.
    ranked = sortperm(candidates, rev=true)[1:nshow]
    vcat([[rae.indexed_words[ii] round(candidates[ii],2)] for ii in ranked]...)
end

# Run show_best on every column of `ĉs` and place the resulting tables side by side.
function show_bests(rae::RAE,ĉs::Embeddings, nbest=20)
    tables = [show_best(rae, ĉs[:, col], nbest) for col in 1:size(ĉs, 2)]
    return hcat(tables...)
end


In [None]:
# Nearest-word tables for the reconstructions, then the odd-numbered entries of the
# first row (the word columns, since each table contributes a word and a score).
# NOTE(review): `rae` and `ĉ_ij` are not defined in the visible cells - confirm
# where they come from.
bs = show_bests(rae, ĉ_ij)
bs[1,:][1:2:end]

In [None]:
# Display the raw reconstructions.
# NOTE(review): `ĉ_ij` is not defined in the visible cells.
ĉ_ij