In [9]:
using Compat
using Docile
using Iterators
using Pipe
using DataStructures

macro printval(ee)
    ee_expr = @sprintf "%s" string(ee)
    esc(:(println($ee_expr," = ", $ee)))
end

macro pz(ee)
    ee_expr = @sprintf "%s" string(ee)
    esc(:(println($ee_expr,"\t\t",typeof($ee), "\t", size($ee))))
end

In [10]:
push!(LOAD_PATH, map(x->"../"*x, filter(fn-> !(contains(fn,".")),readdir("..")))...)


20-element Array{Union(ASCIIString,UTF8String),1}:
 "/root/buildFromSource/julia/usr/local/share/julia/site/v0.3"
 "/root/buildFromSource/julia/usr/share/julia/site/v0.3"      
 "../Corpus"                                                  
 "../doc2vec"                                                 
 "../Models"                                                  
 "../Optimisation"                                            
 "../recursive_embeddings"                                    
 "../summaristation"                                          
 "../tools"                                                   
 "../util"                                                    
 "../word-embedding3"                                         
 "../Corpus"                                                  
 "../doc2vec"                                                 
 "../Models"                                                  
 "../Optimisation"                                            
 "..

In [11]:
using WordEmbeddings

In [12]:
training = open("../Corpus/serialised/opinosis_train_dev_plain.jsz","r") do fs
    deserialize(fs)
end
@pz training

training		Array{Array{String,1},1}	(6097,)


In [13]:
const START_PAD_WORD = "*START*"
const END_WORD = "*END*"

function pad{S<:String}(sent::Vector{S}, padded_length)
    if length(sent) <= padded_length
        ret =  fill(START_PAD_WORD,padded_length)
        ret[end-length(sent)+1:end] = sent
        ret
    else
        sent
    end
end

function pad_advanced{S<:String}(sent::Vector{S}, window_length::Int)
    padded_length = 2 + max(window_length+1, length(sent)) # ALways have at least 1 start and end padding, so plus 2 elements
    ret =  fill(START_PAD_WORD,padded_length)
    ret[end]=END_WORD
    ret[end-length(sent)-1+1: end-1]=sent
    ret
end





pad_advanced (generic function with 1 method)

add_all_words! (generic function with 4 methods)

In [15]:
type PVDM{N<:Number, S<:AbstractString}
    we::WE
    pe::WE #use a word embedder for Paragraphs too
    
    W::AbstractMatrix{N}
    b::AbstractVector{N}

    window_length::Int
    varience::N
end

function PVDM{N,S}(we::WE{N,S}, window_length::Int, varience=0.001)
    
    emb_width,n_words = size(we.L)
    concat_layer_width = emb_width*(window_length+1)
    const W = convert(Matrix{N}, varience*randn(n_words,concat_layer_width))
    const b = convert(Vector{N}, varience*randn(n_words))
    
    pe = WE(N,Vector{S},emb_width)
    
    PVDM{N,S}( we, pe, W,b, window_length, varience)
end

@doc "Lightwieght version, that does not have support for lookups or additions" ->
function PVDM_light{N,S}(pvdm::PVDM{N,S})
    we = WE_light(pvdm.we)
    pe = WE_light(pvdm.pe)
    W = pvdm.W
    b = pvdm.b
    window_length = pvdm.window_length
    varience = nan(N)
    PVDM{N,S}( we, pe, W,b, window_length, varience)
end

@doc "Lightwieght version, that does not have support for lookups or additions, and has N converted type N2" ->
function PVDM_light{N,S}(pvdm::PVDM{N,S}, N2::DataType)
    we = WE_light(pvdm.we,N2)
    pe = WE_light(pvdm.pe, N2)
    W = convert(Matrix{N2},pvdm.W)
    b = convert(Vector{N2},pvdm.b)
    window_length = pvdm.window_length
    varience = convert(N2,NaN)
    PVDM{N2,S}( we, pe, W,b, window_length, varience)
end
    
    

LoadError: invalid redefinition of constant PVDM
while loading In[15], in expression starting on line 1

In [16]:
@doc """gets the training cases as vector of (paraIndex, [word_indexes], label_word_index),
cycling by the window length.
Adds the paragraph if it does not already have an index
""" ->
function get_para_training_cases!{S<:String}(pvdm::PVDM, para::Vector{S})
    para_ind = get_word_index!(pvdm.pe, para, pvdm.varience)
    
    Task() do 
        @assert length(para)>=pvdm.window_length+1
        for offset in 0:length(para)-(pvdm.window_length+1)
            window_iis = [1:pvdm.window_length;]+offset
            label_ii = pvdm.window_length+1+offset
            
            window_words = para[window_iis]
            label_word = para[label_ii]
                        
            windows_indexes = map(word->get_word_index(pvdm.we, word), window_words)
            label_index = get_word_index(pvdm.we, label_word)
            
            produce(Int64[para_ind, windows_indexes..., label_index])
        end
    end
    
end

get_para_training_cases! (generic function with 1 method)

In [17]:
function get_input_layer(pvdm::PVDM, para_index::Int, window_indexes::Vector{Int})
    @inbounds [pvdm.pe.L[:,para_index], vec(pvdm.we.L[:,window_indexes])]
end 

function get_input_layers{N,S, I<:Int}(pvdm::PVDM{N,S}, para_indexes::Vector{I}, window_indexeses::Matrix{I})
    const emb_width = size(pvdm.we.L,1)
    const n_training = length(para_indexes)
    
    xs = Array(N,(emb_width * (pvdm.window_length+1),n_training))
    @inbounds xs[1:emb_width,:] = pvdm.pe.L[:,para_indexes]
    for training_case in 1:n_training
        @inbounds const window_indexes = window_indexeses[:,training_case]
        @inbounds xs[emb_width+1:end,training_case] = vec(pvdm.we.L[:,window_indexes])
    end
    xs
end

get_input_layers (generic function with 1 method)

In [18]:
function softmax(zs)
    (1./sum(exp(zs),1)).*exp(zs)
end

function feedforward{N,S, I<:Int}(pvdm::PVDM{N,S}, para_indexes::Vector{I}, window_indexeses::Matrix{I})
    xs = get_input_layers(pvdm, para_indexes, window_indexeses)
    
    #Speed optimised version of `zs = pvdm.W*xs .+ pvdm.b`
    zs = pvdm.W*xs 
    const n_training = length(para_indexes)
    for ii in 1:n_training
        @inbounds zs[:,ii]+= pvdm.b
    end
    ŷs = softmax(zs)
    ŷs, xs
end

feedforward (generic function with 1 method)

In [19]:
function loss{I,N}(y_indexes::Vector{I}, ŷs::Matrix{N})
    @assert length(y_indexes)>0
    #C=−∑j yj*log ŷj,
    c = zero(N)
    for tc in 1:length(y_indexes)
        @inbounds c-=log(ŷs[y_indexes[tc],tc])
    end
    c/length(y_indexes)
end

loss (generic function with 1 method)

In [20]:
function backprop{N,S,I}(pvdm::PVDM{N,S}, y_indexes::Vector{I}, ŷs::Matrix{N}, xs::Matrix{N} , para_indexes::Vector{I}, window_indexeses::Matrix{I} )
    const emb_width = size(pvdm.we.L,1)
    const n_training = length(para_indexes)
    const window_len = pvdm.window_length 
    
    @assert length(n_training)>0
    
    #Δb = zeros(pvdm.b)
    #ΔW = zeros(pvdm.W)
    ΔL = zeros(pvdm.we.L) #Word Vector Changes
    ΔD = zeros(pvdm.pe.L) #Paragraph Vector Changes
    
        
    #speed optimistation of `δ_top_s = ŷs.-ys`
    δ_top_s = copy(ŷs)
    for tc in 1:length(y_indexes)
        @inbounds δ_top_s[y_indexes[tc],tc]-=one(typeof(ŷs[1]))
    end

    Δb = sum(δ_top_s,2) |> vec
    ΔW = (δ_top_s * xs')
    δ_input_s= (pvdm.W'*δ_top_s) #the activation function of the layer below dxs=d(1*D[ii];L[iis]) =1
    
    #Paragraph vector Error
    for ii in 1:n_training #Add sequentially, reather than via in a += as that would only allow one add for repreased index
        @inbounds ΔD[:,para_indexes[ii]] += δ_input_s[1:emb_width,ii]
    end
    
    #word vectors
    for ii in 1:n_training
        for ww in 1:window_len
            const offset=ww*emb_width
            @inbounds ΔL[:,window_indexeses[ww,ii]]+=δ_input_s[offset+1:offset+emb_width, ii]
        end
    end
    
   
    ΔL./n_training, ΔD./n_training, ΔW./n_training, Δb./n_training
end

backprop (generic function with 1 method)

In [21]:
@doc "This assumes the number of works and paragraphs known remains constant" ->
function unpack!(pvdm::PVDM, θ::Vector)
    start=0
    item=pvdm.we.L
    len_total=length(item)
    @inbounds pvdm.we.L = @pipe θ[1+start:len_total]|>reshape(_,size(item)...)
    
    start+=length(item)
    item=pvdm.pe.L
    len_total+=length(item)
    @inbounds pvdm.pe.L = @pipe θ[1+start:len_total]|>reshape(_,size(item)...)
    
    start+=length(item)
    item=pvdm.W 
    len_total+=length(item)
    @inbounds pvdm.W = @pipe θ[1+start:len_total]|>reshape(_,size(item)...)
    
    start+=length(item)
    item=pvdm.b 
    len_total+=length(item)
    @inbounds pvdm.b = @pipe θ[1+start:len_total]
    
    pvdm
end


@doc "This assumes the number of works and paragraphs known remains constant" ->
function pack{N}(L::AbstractMatrix{N}, D::AbstractMatrix{N}, W::AbstractMatrix{N},b::AbstractVector{N})
    vcat(vec(L),vec(D), vec(W),b)
end

@doc "This assumes the number of works and paragraphs known remains constant" ->
function pack(pvdm::PVDM)
    pack(pvdm.we.L, pvdm.pe.L, pvdm.W, pvdm.b)
end


pack (generic function with 2 methods)

In [22]:
const WINDOW_LEN = 8 
training = Vector{String}[pad_advanced(para, WINDOW_LEN) for para in training]

we_outer = WE(Float32,String, 200)
add_all_words!(we_outer, training, 0.1)
const pvdm_outer = PVDM(we_outer, WINDOW_LEN, 0.1);
we_outer=0


const training_indexes = @pipe chain(map(para -> get_para_training_cases!(pvdm_outer, para), training)...) |> hcat(_...)
const para_indexes_o = training_indexes[1,:] |> vec
const window_indexes_o = training_indexes[2:end-1,:] 
const label_indexes_o = training_indexes[end,:] |> vec;



In [23]:
function loss!(θ::Vector)  
    #warn("loss! not defined")
    grad = similar(θ)
    cached_loss_and_loss_grad!(θ, grad)
end

function loss_grad!(θ::Vector, storage::Vector) 
    #warn("loss_grad not defined")
    cached_loss_and_loss_grad!(θ, grad)
end


function loss_and_loss_grad!(θ::Vector, grad::Vector)   
    grad[:] = 0
    unpack!(pvdm_outer, θ)
    
    ŷs,xs = feedforward(pvdm_outer, para_indexes_o, window_indexes_o)
    Δs  = backprop(pvdm_outer, label_indexes_o, ŷs,xs, para_indexes_o, window_indexes_o)
    grad[:] = pack(Δs...)
    loss(label_indexes_o, ŷs)
end



loss_and_loss_grad! (generic function with 1 method)

In [24]:
loss_and_loss_grad_cache = Dict{NumericVector,(Number, NumericVector)}()
loss_and_loss_grad_cache_hits = 0
loss_and_loss_grad_cache_misses = 0
function cached_loss_and_loss_grad!(θ::Vector, grad::Vector)
    global loss_and_loss_grad_cache
    global loss_and_loss_grad_cache_hits
    global loss_and_loss_grad_cache_misses
    if haskey(loss_and_loss_grad_cache,θ)
        loss_and_loss_grad_cache_hits+=1
        err, grad[:] = loss_and_loss_grad_cache[θ]
        err
    else
        loss_and_loss_grad_cache_misses+=1
        err = loss_and_loss_grad!(θ, grad)
        loss_and_loss_grad_cache[θ] = (err, grad)
        err
    end
end

cached_loss_and_loss_grad! (generic function with 1 method)

In [32]:
using Optim #https://github.com/JuliaOpt/Optim.jl
f=DifferentiableFunction(loss!,loss_grad!,cached_loss_and_loss_grad!)
#θ = pack(pvdm_outer)
θ=res.minimum
#θ=optx
@time res = optimize(f, θ, method=:cg, show_trace = true, store_trace = true, iterations = 20);
@printval res.f_calls 
@printval res.g_calls 
@printval res.iterations
@printval res.f_minimum
@printval res.f_converged 
@printval res.trace
@printval loss_and_loss_grad_cache_hits
@printval loss_and_loss_grad_cache_misses

Iter     Function value   Gradient norm 
     0     5.372543e+00     2.525859e-02
     1     5.348296e+00     1.087051e-02
     2     5.327250e+00     2.117648e-02
     3     5.301131e+00     2.745742e-02
     4     5.236059e+00     2.439390e-02
     5     5.152967e+00     2.285820e-02
     6     5.111018e+00     3.013088e-02
     7     5.039601e+00     6.645359e-02
     8     4.964079e+00     1.482462e-02
     9     4.930750e+00     5.614407e-02
    10     4.851865e+00     1.417908e-02
    11     4.816745e+00     3.000415e-02
    12     4.745643e+00     2.255416e-02
    13     4.699772e+00     1.087656e-02
    14     4.663737e+00     2.686708e-02
    15     4.586313e+00     3.745591e-02
    16     4.501931e+00     2.965717e-02
    17     4.434673e+00     2.103150e-02
    18     4.377546e+00     1.942718e-02
    19     4.303544e+00     4.502885e-02
    20     4.217514e+00     2.911804e-02
elapsed time: 3725.801402937 seconds (691079721816 bytes allocated, 8.92% gc time)
res.f_calls = 6

In [30]:
@printval res.f_calls 
@printval res.g_calls 
@printval res.iterations
@printval res.f_minimum
@printval res.gr_converged
@printval res.x_converged                       
@printval res.f_converged 
@printval res.trace
@printval loss_and_loss_grad_cache_hits
@printval loss_and_loss_grad_cache_misses

res.f_calls = 16
res.g_calls = 11
res.iterations = 5
res.f_minimum = 5.504724979400635
res.gr_converged = false
res.x_converged = false
res.f_converged = false
res.trace = Iter     Function value   Gradient norm 
------   --------------   --------------
     0     5.990041e+00     1.915355e-02
     1     5.914227e+00     1.475701e-02
     2     5.803874e+00     5.204834e-02
     3     5.728202e+00     1.351991e-02
     4     5.631781e+00     2.845515e-02
     5     5.504725e+00     3.959038e-02

loss_and_loss_grad_cache_hits = 56
loss_and_loss_grad_cache_misses = 119


In [None]:
)