In [1]:
using Compat
using Iterators
using Pipe
using DataStructures

macro printval(ee)
    ee_expr = @sprintf "%s" string(ee)
    esc(:(println($ee_expr," = ", $ee)))
end

macro pz(ee)
    ee_expr = @sprintf "%s" string(ee)
    esc(:(println($ee_expr,"\t\t",typeof($ee), "\t", size($ee))))
end

In [2]:
push!(LOAD_PATH, map(x->"../"*x, filter(fn-> !(contains(fn,".")),readdir("..")))...)


11-element Array{Union{UTF8String,ASCIIString},1}:
 "/root/buildFromSource/julia0.4/usr/local/share/julia/site/v0.4"
 "/root/buildFromSource/julia0.4/usr/share/julia/site/v0.4"      
 "../Corpus"                                                     
 "../doc2vec"                                                    
 "../Models"                                                     
 "../Optimisation"                                               
 "../recursive_embeddings"                                       
 "../summaristation"                                             
 "../tools"                                                      
 "../util"                                                       
 "../word-embedding3"                                            

In [3]:
function hcat_no_splatting(xss::Vector{Any}) #Not actually for vector Any, only really for Vector{Vector{Any}}
    ncols = length(xss)
    nrows = length(first(xss))
    S = xss|>first|>eltype
    ret = Array(S,(nrows,ncols))
    for ii in eachindex(xss)
        @inbounds col = xss[ii]
        #@assert(length(col)==nrows)
        @inbounds ret[:,ii] = col
    end
    ret
end

hcat_no_splatting (generic function with 1 method)

In [4]:
using WordEmbeddings

In [5]:
training = open("../Corpus/serialised/opinosis_train_dev_plain.jsz","r") do fs
    deserialize(fs)
end
@pz training

training		

In [6]:
const START_PAD_WORD = "*START*"
const END_WORD = "*END*"

function pad{S<:String}(sent::Vector{S}, padded_length)
    if length(sent) <= padded_length
        ret =  fill(START_PAD_WORD,padded_length)
        ret[end-length(sent)+1:end] = sent
        ret
    else
        sent
    end
end

function pad_advanced{S<:String}(sent::Vector{S}, window_length::Int)
    padded_length = 2 + max(window_length+1, length(sent)) # ALways have at least 1 start and end padding, so plus 2 elements
    ret =  fill(START_PAD_WORD,padded_length)
    ret[end]=END_WORD
    ret[end-length(sent)-1+1: end-1]=sent
    ret
end



pad_advanced (generic function with 1 method)

In [7]:
type PVDM{N<:Number, S<:AbstractString}
    we::WE
    pe::WE #use a word embedder for Paragraphs too
    
    W::AbstractMatrix{N}
    b::AbstractVector{N}

    window_length::Int
    varience::N
end

Array{Array{UTF8String,1},1}	(6097,)


In [8]:


function PVDM{N,S}(we::WE{N,S}, window_length::Int, varience=0.001, output_width::Int=size(we.L,2))
    emb_width = size(we.L,1)
    concat_layer_width = emb_width*(window_length+1)
    const W = convert(Matrix{N}, varience*randn(output_width,concat_layer_width))
    const b = convert(Vector{N}, varience*randn(output_width))
    pe = WE(N,Vector{S},emb_width)
    
    PVDM{N,S}( we, pe, W,b, window_length, varience)
end

@doc "Lightwieght version, that does not have support for lookups or additions" ->
function PVDM_light{N,S}(pvdm::PVDM{N,S})
    we = WE_light(pvdm.we)
    pe = WE_light(pvdm.pe)
    W = pvdm.W
    b = pvdm.b
    window_length = pvdm.window_length
    varience = nan(N)
    PVDM{N,S}( we, pe, W,b, window_length, varience)
end

@doc "Lightwieght version, that does not have support for lookups or additions, and has N converted type N2" ->
function PVDM_light{N,S}(pvdm::PVDM{N,S}, N2::DataType)
    we = WE_light(pvdm.we,N2)
    pe = WE_light(pvdm.pe, N2)
    W = convert(Matrix{N2},pvdm.W)
    b = convert(Vector{N2},pvdm.b)
    window_length = pvdm.window_length
    varience = convert(N2,NaN)
    PVDM{N2,S}( we, pe, W,b, window_length, varience)
end
    
    

PVDM_light (generic function with 2 methods)

In [9]:
@doc """gets the training cases as vector of (paraIndex, [word_indexes], label_word_index),
cycling by the window length.
Adds the paragraph if it does not already have an index
""" ->
function get_para_training_cases!{S<:String}(pvdm::PVDM, para::Vector{S})
    para_ind = get_word_index!(pvdm.pe, para, pvdm.varience)
    
    Task() do 
        @assert length(para)>=pvdm.window_length+1
        for offset in 0:length(para)-(pvdm.window_length+1)
            window_iis = [1:pvdm.window_length;]+offset
            label_ii = pvdm.window_length+1+offset
            
            window_words = para[window_iis]
            label_word = para[label_ii]
                        
            windows_indexes = map(word->get_word_index(pvdm.we, word), window_words)
            label_index = get_word_index(pvdm.we, label_word)
            
            produce(Int64[para_ind, windows_indexes..., label_index])
        end
    end
    
end

get_para_training_cases! (generic function with 1 method)

In [10]:
function get_input_layer(pvdm::PVDM, para_index::Int, window_indexes::Vector{Int})
    @inbounds [pvdm.pe.L[:,para_index], vec(pvdm.we.L[:,window_indexes])]
end 

function get_input_layers{N,S, I<:Int}(pvdm::PVDM{N,S}, para_indexes::Vector{I}, window_indexeses::Matrix{I})
    const emb_width = size(pvdm.we.L,1)
    const n_training = length(para_indexes)
    
    xs = Array(N,(emb_width * (pvdm.window_length+1),n_training))
    @inbounds xs[1:emb_width,:] = pvdm.pe.L[:,para_indexes]
    for training_case in 1:n_training
        @inbounds const window_indexes = window_indexeses[:,training_case]
        @inbounds xs[emb_width+1:end,training_case] = vec(pvdm.we.L[:,window_indexes])
    end
    xs
end

get_input_layers (generic function with 1 method)

In [11]:
function feedforward{N,S, I<:Int}(pvdm::PVDM{N,S}, para_indexes::Vector{I}, window_indexeses::Matrix{I}, output_activation::Function)
    xs = get_input_layers(pvdm, para_indexes, window_indexeses)
    
    #Speed optimised version of `zs = pvdm.W*xs .+ pvdm.b`
    zs = pvdm.W*xs 
    const n_training = length(para_indexes)
    for ii in 1:n_training
        @inbounds zs[:,ii]+= pvdm.b
    end
    ŷs = output_activation(zs)
    ŷs, xs
end

feedforward (generic function with 1 method)

In [12]:
function softmax(zs)
    (1./sum(exp(zs),1)).*exp(zs)
end

function loss_softmax{I,N}(y_indexes::Vector{I}, ŷs::Matrix{N})
    @assert length(y_indexes)>0
    #C=−∑j yj*log ŷj,
    c = zero(N)
    for tc in 1:length(y_indexes)
        @inbounds c-=log(ŷs[y_indexes[tc],tc])
    end
    c/length(y_indexes)
end

function feedforward_softmax{N,S, I<:Int}(pvdm::PVDM{N,S}, para_indexes::Vector{I}, window_indexeses::Matrix{I})
    feedforward(pvdm, para_indexes, window_indexeses, softmax)
end

function backprop_softmax{N,S,I}(pvdm::PVDM{N,S}, y_indexes::Vector{I}, ŷs::Matrix{N}, xs::Matrix{N} , para_indexes::Vector{I}, window_indexeses::Matrix{I})
    #speed optimistation of `δ_top_s = ŷs.-ys`
    δ_top_s = copy(ŷs)
    for tc in 1:length(y_indexes)
        @inbounds δ_top_s[y_indexes[tc],tc]-=one(typeof(ŷs[1]))
    end

    backprop(pvdm, δ_top_s, ŷs, xs, para_indexes, window_indexeses)
end



backprop_softmax (generic function with 1 method)

In [13]:
@fastmath function sigmoid(zs) 
    #1./(1.0+e.^-zs)
    #Speed op of 1./(1+e.^-zs) 
    ret = similar(zs)
    @simd for ii in eachindex(ret)
        @inbounds ret[ii]=1./exp(-zs[ii]+1.0) #
    end
    ret
end

function loss_hierarchical_softmax{N}(y_codes::Vector{BitVector}, ŷs::Matrix{N})
    c = zero(N)
    for tc in 1:size(ŷs,2)
        const y_code= y_codes[tc]
        for d in eachindex(y_code)
            const y = y_code[d] ? zero(N) : one(N)
            c += y - ŷs[d,tc]
            #if y_code[d]
            #    c+=ln(ŷs[d,tc])
            #end
        end
    end
    c/size(ŷs,2)
end


function feedforward_hierarchical_softmax{N,S, I<:Int}(pvdm::PVDM{N,S}, para_indexes::Vector{I}, window_indexeses::Matrix{I})
    feedforward(pvdm, para_indexes, window_indexeses, sigmoid)
end

function backprop_hierarchical_softmax{N,S,I}(pvdm::PVDM{N,S}, y_codes::Vector{BitVector}, ŷs::Matrix{N}, xs::Matrix{N} , para_indexes::Vector{I}, window_indexeses::Matrix{I})
    #speed optimistation of `δ_top_s = ŷs.-ys`
    δ_top_s = similar(ŷs)
    for tc in size(ŷs,2)
        const y_code= y_codes[tc]
        for d in length(y_code)
            const y = y_code[d] ? zero(N) : one(N)
            δ_top_s[d,tc] = y - ŷs[d,tc]
            #δ_top_s[d,tc] =  y_code[d] ? one(N)-ŷs[d,tc] : zero(N)
        end
        for d in length(y_code):size(ŷs,1)
            δ_top_s[d,tc]=zero(N)
        end
    end

    backprop(pvdm, δ_top_s, ŷs, xs, para_indexes, window_indexeses)
end



backprop_hierarchical_softmax (generic function with 1 method)

In [14]:
ŷs,xs=feedforward_hierarchical_softmax(pvdm_outer, para_indexes_o, window_indexes_o)
backprop_hierarchical_softmax(pvdm_outer, label_codes_o, ŷs,xs, para_indexes_o, window_indexes_o)

LoadError: LoadError: UndefVarError: pvdm_outer not defined
while loading In[14], in expression starting on line 1

In [15]:
function backprop{N,S,I}(pvdm::PVDM{N,S}, δ_top_s::Matrix{N}, ŷs::Matrix{N}, xs::Matrix{N} , para_indexes::Vector{I}, window_indexeses::Matrix{I})
    const emb_width = size(pvdm.we.L,1)
    const n_training = length(para_indexes)
    const window_len = pvdm.window_length 

    @assert length(n_training)>0
    
    ΔL = zeros(pvdm.we.L) #Word Vector Changes
    ΔD = zeros(pvdm.pe.L) #Paragraph Vector Changes
    
    Δb = sum(δ_top_s,2) |> vec
    ΔW = (δ_top_s * xs')
    δ_input_s= (pvdm.W'*δ_top_s) #the activation function of the layer below dxs=d(1*D[ii];L[iis]) =1
    
    #Paragraph vector Error
    for ii in 1:n_training #Add sequentially, reather than via in a += as that would only allow one add for repreased index
        @inbounds ΔD[:,para_indexes[ii]] += δ_input_s[1:emb_width,ii]
    end
    
    #word vectors
    for ii in 1:n_training
        for ww in 1:window_len
            const offset=ww*emb_width
            @inbounds ΔL[:,window_indexeses[ww,ii]]+=δ_input_s[offset+1:offset+emb_width, ii]
        end
    end
    
    ΔL./n_training, ΔD./n_training, ΔW./n_training, Δb./n_training
end

backprop (generic function with 1 method)

In [16]:
@doc "This assumes the number of works and paragraphs known remains constant" ->
function unpack!(pvdm::PVDM, θ::Vector)
    start=0
    item=pvdm.we.L
    len_total=length(item)
    @inbounds pvdm.we.L = @pipe θ[1+start:len_total]|>reshape(_,size(item)...)
    
    start+=length(item)
    item=pvdm.pe.L
    len_total+=length(item)
    @inbounds pvdm.pe.L = @pipe θ[1+start:len_total]|>reshape(_,size(item)...)
    
    start+=length(item)
    item=pvdm.W 
    len_total+=length(item)
    @inbounds pvdm.W = @pipe θ[1+start:len_total]|>reshape(_,size(item)...)
    
    start+=length(item)
    item=pvdm.b 
    len_total+=length(item)
    @inbounds pvdm.b = @pipe θ[1+start:len_total]
    
    pvdm
end


@doc "This assumes the number of works and paragraphs known remains constant" ->
function pack{N}(L::AbstractMatrix{N}, D::AbstractMatrix{N}, W::AbstractMatrix{N},b::AbstractVector{N})
    vcat(vec(L),vec(D), vec(W),b)
end

@doc "This assumes the number of works and paragraphs known remains constant" ->
function pack(pvdm::PVDM)
    pack(pvdm.we.L, pvdm.pe.L, pvdm.W, pvdm.b)
end


pack (generic function with 2 methods)

In [17]:
using Huffman

In [18]:
const WINDOW_LEN = 8 

@time training = Vector{String}[pad_advanced(para, WINDOW_LEN) for para in training]

#@time word2code_o = get_huffman_codes(training) 

  

6097-element Array{Array{AbstractString,1},1}:
 AbstractString["*START*","being","able","to","change","the","*UNKNOWN*","sizes","is","awesome","!","*END*"]                                                                            
 AbstractString["*START*","for","whatever","reason",",","*UNKNOWN*","decided","to","make","the"  …  "on","the","home","screen","than","on","the","*UNKNOWN*",".","*END*"]               
 AbstractString["*START*","i","found","myself","constantly","changing","the","angle","of","the"  …  "and","down","and","the","distance","away","from","me",".","*END*"]                 
 AbstractString["*START*","i","was","an","avid","reader","but","increasing","age","has"  …  "very","light","weight","has","made","reading","fun","again",".","*END*"]                   
 AbstractString["*START*","what","'s","more",",","it","'s","easy","to","change","*UNKNOWN*","size",".","*END*"]                                                                         
 AbstractString["*START*","t

80.931 milliseconds (188 k allocations: 6608 KB, 26.49% gc time)


In [19]:
using HDF5, JLD
#save("huffmancodes.jld", "word2code", word2code_o)
word2code_o = load("huffmancodes.jld", "word2code")

Dict{AbstractString,BitArray{1}} with 5118 entries:
  "squat"          => Bool[false,true,true,false,true,true,false,false,true,tru…
  "kms"            => Bool[false,true,true,false,true,true,false,true,true,fals…
  "crisp"          => Bool[false,false,false,true,true,false,true,true,false,tr…
  "enjoy"          => Bool[false,true,false,false,false,true,false,false,false,…
  "whoever"        => Bool[false,true,true,true,false,false,false,false,false,t…
  "airbags"        => Bool[false,true,true,true,false,false,true,true,true,true…
  "advertisements" => Bool[false,true,true,false,true,false,true,true,true,fals…
  "chocolate"      => Bool[false,false,true,false,true,true,true,false,true,tru…
  "mangled"        => Bool[false,true,true,true,false,false,true,false,false,tr…
  "everywhere"     => Bool[true,true,true,true,true,false,true,false,false,true…
  "regular"        => Bool[false,false,false,false,true,false,true,true,true,tr…
  "favorites"      => Bool[false,true,true,true,false,tru

In [20]:
@time const max_code_length = @pipe word2code_o |> values |> map(length,_) |> maximum(_)
@printval max_code_length

we_outer = WE(Float32,String, 200)
add_all_words!(we_outer, training, 0.01)
const pvdm_outer = PVDM(we_outer, WINDOW_LEN, 0.01, max_code_length);
we_outer=0

index2code_o(ii) = word2code_o[pvdm_outer.we.indexed_words[ii]]
const training_indexes_o = @pipe ( training
                                    |> map(para -> get_para_training_cases!(pvdm_outer, para),_) 
                                    |> chain(_...)
                                    |> collect
                                    |> hcat_no_splatting)
const para_indexes_o = training_indexes_o[1,:] |> vec
const window_indexes_o = training_indexes_o[2:end-1,:] 
const label_indexes_o = training_indexes_o[end,:] |> vec;


const label_codes_o = BitVector[index2code_o(ii) for ii in label_indexes_o]

  

85666-element Array{BitArray{1},1}:
 Bool[false,true,false,true,true,true]                                                  
 Bool[false,false,true,false,false,false,false,false,false,false,true,false]            
 Bool[false,true,false,false,false,true,true,true,false]                                
 Bool[false,false,false,false,false]                                                    
 Bool[false,false,false,false,true,false,false,false,false,false,false]                 
 Bool[true,false,false,false]                                                           
 Bool[false,false,true,true,false]                                                      
 Bool[true,true,false,false,false,false,false]                                          
 Bool[true,false,false,false]                                                           
 Bool[false,true,false,false,true,true,true,true,true,false,false,true,false]           
 Bool[true,true,false,false,false,false,true,false]                       

83.749 milliseconds (46935 allocations: 2161 KB)
max_code_length = 17


In [21]:
function loss!(θ::Vector)  
    #warn("loss! not defined")
    grad = similar(θ)
    cached_loss_and_loss_grad!(θ, grad)
end

function loss_grad!(θ::Vector, storage::Vector) 
    #warn("loss_grad not defined")
    cached_loss_and_loss_grad!(θ, grad)
end


function loss_and_loss_grad!(θ::Vector, grad::Vector)   
    unpack!(pvdm_outer, θ)
    
    ŷs,xs = feedforward_hierarchical_softmax(pvdm_outer, para_indexes_o, window_indexes_o)
    Δs  = backprop_hierarchical_softmax(pvdm_outer, label_codes_o, ŷs,xs, para_indexes_o, window_indexes_o)
    grad[:] = pack(Δs...)
    loss_hierarchical_softmax(label_codes_o, ŷs)
end



loss_and_loss_grad! (generic function with 1 method)

In [22]:
N = typeof(pvdm_outer.varience)
loss_and_loss_grad_cache = Dict{Vector{N}, Tuple{N,AbstractVector{N}}}()
loss_and_loss_grad_cache_hits = 0
loss_and_loss_grad_cache_misses = 0
function cached_loss_and_loss_grad!(θ::Vector, grad::Vector)
    global loss_and_loss_grad_cache
    global loss_and_loss_grad_cache_hits
    global loss_and_loss_grad_cache_misses
    if haskey(loss_and_loss_grad_cache,θ)
        loss_and_loss_grad_cache_hits+=1
        err, grad[:] = loss_and_loss_grad_cache[θ]
        err
    else
        loss_and_loss_grad_cache_misses+=1
        err = loss_and_loss_grad!(θ, grad)
        loss_and_loss_grad_cache[θ] = (err, grad)
        err
    end
end

cached_loss_and_loss_grad! (generic function with 1 method)

In [None]:
using Optim #https://github.com/JuliaOpt/Optim.jl
f=DifferentiableFunction(loss!,loss_grad!,cached_loss_and_loss_grad!)
θ = pack(pvdm_outer)
#θ=res.minimum
#θ=optx
@time res = optimize(f, θ, method=:cg, show_trace = true, store_trace = true, iterations = 2);
@printval res.f_calls 
@printval res.g_calls 
@printval res.iterations
@printval res.f_minimum
@printval res.f_converged 
@printval res.trace
@printval loss_and_loss_grad_cache_hits
@printval loss_and_loss_grad_cache_misses

In [None]:
)elapsed time: 902.511280671 seconds (173774005532 bytes allocated, 10.39% gc time)
res.f_calls = 18
res.g_calls = 16
res.iterations = 2
res.f_minimum = 6.521084785461426
res.f_converged = false
res.trace = Iter     Function value   Gradient norm 
------   --------------   --------------
     0     8.649927e+00     1.244972e-01
     1     6.932991e+00     7.786864e-02
     2     6.521085e+00     1.384170e-01

loss_and_loss_grad_cache_hits = 9
loss_and_loss_grad_cache_misses = 9