In [2]:
using Compat
using Docile
using Iterators
using Pipe
using Devectorize

# Debug helper: prints the source text of an expression followed by " = " and its value.
# The whole generated call is escaped, so it is evaluated in the caller's scope.
macro printval(ee)
    label = string(ee)
    return esc(:(println($label, " = ", $ee)))
end

# Debug helper: prints the source text of an expression, then the type and the size of
# its value, separated by tabs. The generated call is evaluated in the caller's scope.
macro pz(ee)
    label = string(ee)
    return esc(:(println($label, "\t\t", typeof($ee), "\t", size($ee))))
end

In [3]:
# Inverse of `zip`: zips the elements of `xs` together and gathers the resulting tuples
# into an array (the i-th result holds the i-th component of every element of `xs`).
function unzip(xs)
    zipped = zip(xs...)
    return [zipped...]
end

unzip (generic function with 1 method)

In [4]:
# Append "../<name>" to LOAD_PATH for every entry of the parent directory whose name
# contains no "." (a name-based filter; the entries are not checked to be directories).
push!(LOAD_PATH,
      ["../" * name for name in filter(entry -> !contains(entry, "."), readdir(".."))]...)


11-element Array{Union(ASCIIString,UTF8String),1}:
 "/root/buildFromSource/julia/usr/local/share/julia/site/v0.3"
 "/root/buildFromSource/julia/usr/share/julia/site/v0.3"      
 "../Corpus"                                                  
 "../doc2vec"                                                 
 "../Models"                                                  
 "../Optimisation"                                            
 "../recursive_embeddings"                                    
 "../summaristation"                                          
 "../tools"                                                   
 "../util"                                                    
 "../word-embedding3"                                         

In [5]:
using WordEmbeddings

In [6]:
# Load the serialised training corpus. The do-block form of `open` closes the file
# once the block returns.
# NOTE(review): `deserialize` should only be pointed at trusted files.
training = open("../Corpus/serialised/opinosis_train_dev_plain.jsz","r") do fs
    deserialize(fs)
end
# Print the name, type and size of the loaded corpus.
@pz training

training		Array{Array{String,1},1}	(6097,)


In [7]:
# Keep only the first two paragraphs of the corpus for the rest of the notebook.
training=training[1:2]

2-element Array{Array{String,1},1}:
 String["being","able","to","change","the","*UNKNOWN*","sizes","is","awesome","!"]                                                                     
 String["for","whatever","reason",",","*UNKNOWN*","decided","to","make","the","*UNKNOWN*","on","the","home","screen","than","on","the","*UNKNOWN*","."]

In [8]:

# Left-pads a sentence to `padded_length` words.
#
# If `sent` has at most `padded_length` words, a new vector of length `padded_length` is
# returned: `pad_word` repeated at the front, the words of `sent` at the end.
# A longer sentence is returned unchanged (the same object, not a copy).
function pad{S<:String}(sent::Vector{S}, padded_length, pad_word="*STARTPAD*")
    n_words = length(sent)
    if n_words > padded_length
        return sent
    end

    padded = fill(pad_word, padded_length)
    padded[(padded_length - n_words + 1):padded_length] = sent
    return padded
end

pad (generic function with 2 methods)

In [9]:

import WordEmbeddings.WE
# Builds an empty embedding store: an `embedding_width` x 0 matrix with element type `N`,
# an empty `S => Int` index dictionary and an empty list of indexed items of type `S`.
function WE(N::DataType, S::DataType, embedding_width::Int)
    empty_embeddings = Array(N, (embedding_width, 0))
    return WE(empty_embeddings, Dict{S,Int}(), S[])
end

# Lightweight copy of `we` whose embedding matrix is converted to element type `N2`
# (defaults to the current element type `N`).
#
# The word index and the word list of the result are left empty, so the copy carries
# only the embedding matrix and cannot be used for word lookups.
#
# The type annotation belongs on the parameter (`N2::DataType=N`); the previous spelling
# `N2=N::DataType` asserted the type of the default value and left `N2` itself untyped.
function WE_light{N,S}(we::WE{N,S}, N2::DataType=N)
    L=convert(Matrix{N2},we.L)
    word_index=Dict{S,Int}()
    indexed_words=S[]
    WE(L,word_index,indexed_words)
end

# Lightweight copy of `we`: reuses the same embedding matrix object (no conversion, no
# copy) and starts with an empty word index and an empty word list.
function WE_light{N,S}(we::WE{N,S})
    return WE(we.L, Dict{S,Int}(), S[])
end



@doc "Returns the index of `word` in `we`; an unknown word is first registered under the next free index with a new random embedding column" ->
function get_word_index!{N,S, S2}(we::WE{N,S}, word::S2, word_varience = 0.01)
    if haskey(we.word_index, word)
        return we.word_index[word]
    end

    # Unknown word: it takes the position right after the last indexed word.
    new_index = length(we.indexed_words) + 1
    we.word_index[word] = new_index
    push!(we.indexed_words, word)

    # New embedding column: standard-normal samples scaled by `word_varience`,
    # converted to the element type of the embedding matrix and appended on the right.
    n_rows = size(we.L, 1)
    new_column = convert(Vector{N}, word_varience .* randn(n_rows))
    we.L = hcat(we.L, new_column)
    return new_index
end

# Registers every word of `words` in `we` (words already known are left untouched)
# and returns `we`.
function add_all_words!{N,S}(we::WE{N,S}, words::Vector{S}, word_varience=0.01)
    for ii in 1:length(words)
        get_word_index!(we, words[ii], word_varience)
    end
    return we
end
# Registers the words of every paragraph in `paras`, paragraph by paragraph, and
# returns `we`.
function add_all_words!{N,S}(we::WE{N,S}, paras::Vector{Vector{S}}, word_varience=0.01)
    for ii in 1:length(paras)
        add_all_words!(we, paras[ii], word_varience)
    end
    return we
end

add_all_words! (generic function with 4 methods)

In [10]:
# State of the paragraph-vector model used throughout this notebook.
#   N -- element type of the dense parameters `W`, `b` and of `varience`
#   S -- string type parameter; NOTE(review): no field below refers to it
# NOTE(review): `we` and `pe` are declared with the unparameterised `WE`, and `W`/`b`
# with abstract array types, so field access is not concretely typed -- confirm whether
# that is intentional before tightening.
type PVDM{N<:Number, S<:AbstractString}
    we::WE # word embeddings
    pe::WE #use a word embedder for Paragraphs too
    
    W::AbstractMatrix{N} # weight matrix applied to the concatenated input layer
    b::AbstractVector{N} # bias added to every column of W*xs

    window_length::Int # number of context words per training case
    varience::N # scale applied to the random initial W and b by the constructor
end

# Builds a model on top of the word embeddings `we`.
#
# `W` (n_words x concat_layer_width) and `b` (n_words) are standard-normal samples
# scaled by `varience`; the paragraph embedding store starts empty with the same
# embedding width as `we`.
function PVDM{N,S}(we::WE{N,S}, window_length::Int, varience=0.001)
    emb_width, n_words = size(we.L)

    # The input layer is one paragraph vector followed by `window_length` word vectors.
    concat_layer_width = emb_width * (window_length + 1)

    weights = convert(Matrix{N}, varience * randn(n_words, concat_layer_width))
    biases = convert(Vector{N}, varience * randn(n_words))

    # Paragraphs are keyed by their word vectors, hence the `Vector{S}` key type.
    pe = WE(N, Vector{S}, emb_width)

    return PVDM{N,S}(we, pe, weights, biases, window_length, varience)
end

@doc "Lightweight version that does not have support for lookups or additions" ->
function PVDM_light{N,S}(pvdm::PVDM{N,S})
    # The light embedding stores keep only the embedding matrices (empty indexes).
    we = WE_light(pvdm.we)
    pe = WE_light(pvdm.pe)
    # W and b are shared with the original model, not copied.
    W = pvdm.W
    b = pvdm.b
    window_length = pvdm.window_length
    # The initialisation scale is replaced by NaN in the light copy. `convert(N, NaN)`
    # matches the converting method of PVDM_light and avoids `nan(N)`, which is
    # deprecated in Julia 0.4.
    varience = convert(N, NaN)
    PVDM{N,S}( we, pe, W,b, window_length, varience)
end

@doc "Lightweight version that does not have support for lookups or additions, with every parameter converted to element type N2" ->
function PVDM_light{N,S}(pvdm::PVDM{N,S}, N2::DataType)
    # Embedding stores reduced to their matrices, converted to N2.
    light_we = WE_light(pvdm.we, N2)
    light_pe = WE_light(pvdm.pe, N2)

    # Dense parameters converted to N2; the initialisation scale is replaced by NaN.
    weights = convert(Matrix{N2}, pvdm.W)
    biases = convert(Vector{N2}, pvdm.b)
    no_varience = convert(N2, NaN)

    return PVDM{N2,S}(light_we, light_pe, weights, biases, pvdm.window_length, no_varience)
end
    
    

PVDM_light (generic function with 2 methods)

In [11]:
@doc """Returns a Task producing the training cases of `para`, one per window position.
Each case is a flat `Int64` vector: the paragraph index, then the `window_length` word
indexes of the window, then the index of the label word that follows the window.
The paragraph is registered in `pvdm.pe` if it does not already have an index.
Fails with an assertion error, before anything is registered, when the paragraph has
fewer than `window_length + 1` words.
""" ->
function get_para_training_cases!{S<:String}(pvdm::PVDM, para::Vector{S})
    # Validate up front: previously the check ran lazily inside the Task, after the
    # paragraph had already been added to the paragraph embeddings.
    @assert length(para)>=pvdm.window_length+1
    para_ind = get_word_index!(pvdm.pe, para)
    
    Task() do 
        window_len = pvdm.window_length
        for offset in 0:length(para)-(window_len+1)
            window_words = para[offset+1:offset+window_len]
            label_word = para[offset+window_len+1]
            
            # `get_word_index` (no `!`) is not defined in this file; presumably it is
            # the lookup-only counterpart exported by WordEmbeddings -- TODO confirm.
            windows_indexes = map(word->get_word_index(pvdm.we, word), window_words)
            label_index = get_word_index(pvdm.we, label_word)
            
            produce(Int64[para_ind, windows_indexes..., label_index])
        end
    end
    
end

get_para_training_cases! (generic function with 1 method)

In [12]:
# Builds the input-layer vector of a single training case: the paragraph embedding
# column followed by the embedding columns of the window words, flattened end to end.
#
# The pieces are joined with an explicit `vcat`. The previous `[a, b]` spelling only
# concatenates on Julia 0.3; it is deprecated in 0.4 and builds a vector of vectors
# from 0.5 on.
function get_input_layer(pvdm::PVDM, para_index::Int, window_indexes::Vector{Int})
    @inbounds para_vec = pvdm.pe.L[:,para_index]
    @inbounds word_vecs = vec(pvdm.we.L[:,window_indexes])
    vcat(para_vec, word_vecs)
end 

# Builds the input layers of a batch of training cases, one column per case.
#
# Rows 1:emb_width of a column hold the paragraph embedding; the remaining rows hold
# the embeddings of that case's window words (column `case` of `window_indexeses`),
# flattened end to end.
function get_input_layers{N,S, I<:Int}(pvdm::PVDM{N,S}, para_indexes::Vector{I}, window_indexeses::Matrix{I})
    emb_width = size(pvdm.we.L, 1)
    n_cases = length(para_indexes)
    layer_width = emb_width * (pvdm.window_length + 1)

    inputs = Array(N, (layer_width, n_cases))
    @inbounds inputs[1:emb_width, :] = pvdm.pe.L[:, para_indexes]
    for case in 1:n_cases
        @inbounds inputs[emb_width+1:end, case] = vec(pvdm.we.L[:, window_indexeses[:, case]])
    end
    return inputs
end

get_input_layers (generic function with 1 method)

In [13]:
# Column-wise softmax: every column of the result is `exp` of the matching column of
# `zs`, divided by that column's sum, so each column sums to one.
#
# `exp(zs)` is evaluated once and reused; the arithmetic is otherwise unchanged.
# NOTE(review): the maximum is not subtracted first, so very large entries of `zs`
# can overflow in `exp`.
function softmax(zs)
    exps = exp(zs)
    (1./sum(exps,1)).*exps
end

# Forward pass over a batch of training cases.
#
# Returns `(ŷs, xs)`: the softmax outputs (one column per training case) and the
# input layers they were computed from.
function feedforward{N,S, I<:Int}(pvdm::PVDM{N,S}, para_indexes::Vector{I}, window_indexeses::Matrix{I})
    inputs = get_input_layers(pvdm, para_indexes, window_indexeses)

    # Same result as `pvdm.W*inputs .+ pvdm.b`, with the bias added column by column.
    activations = pvdm.W * inputs
    for case in 1:length(para_indexes)
        @inbounds activations[:, case] += pvdm.b
    end

    return softmax(activations), inputs
end

feedforward (generic function with 1 method)

In [53]:
# Mean cross-entropy of the predictions against index-encoded labels:
# the average over training cases of -log(ŷs[label, case]).
function loss{I,N}(y_indexes::Vector{I}, ŷs::Matrix{N})
    n_cases = length(y_indexes)
    total = zero(N)
    for case in 1:n_cases
        # Only the predicted probability of the true label contributes.
        @inbounds total -= log(ŷs[y_indexes[case], case])
    end
    return total / n_cases
end

loss (generic function with 1 method)

In [54]:
# Analytic gradients of the mean cross-entropy loss for a batch of training cases.
#
# Takes the outputs `ŷs` and input layers `xs` of `feedforward` together with the
# label, paragraph and window indexes of the batch. Returns `(ΔL, ΔD, ΔW, Δb)`: the
# gradients for the word embeddings, the paragraph embeddings, the weight matrix and
# the bias, each divided by the number of training cases.
function backprop{N,S,I}(pvdm::PVDM{N,S}, y_indexes::Vector{I}, ŷs::Matrix{N}, xs::Matrix{N} , para_indexes::Vector{I}, window_indexeses::Matrix{I} )
    emb_width = size(pvdm.we.L, 1)
    n_cases = length(para_indexes)

    grad_words = zeros(pvdm.we.L)
    grad_paras = zeros(pvdm.pe.L)

    # Output error ŷ - y without materialising a one-hot y: copy ŷ and subtract one
    # at the label row of every column.
    δ_out = copy(ŷs)
    for case in 1:length(y_indexes)
        @inbounds δ_out[y_indexes[case], case] -= one(typeof(ŷs[1]))
    end

    grad_b = vec(sum(δ_out, 2))
    grad_W = δ_out * xs'
    # The input layer is a plain concatenation of embeddings, so the error reaching
    # the embeddings is just W' times the output error.
    δ_in = pvdm.W' * δ_out

    # Accumulate case by case: an index may occur several times in a batch, and a
    # single indexed `+=` would count a repeated index only once.
    for case in 1:n_cases
        # Rows 1:emb_width belong to the paragraph vector.
        @inbounds grad_paras[:, para_indexes[case]] += δ_in[1:emb_width, case]

        # Window word number `slot` occupies the rows starting at slot*emb_width + 1.
        for slot in 1:pvdm.window_length
            first_row = slot * emb_width
            @inbounds grad_words[:, window_indexeses[slot, case]] += δ_in[first_row+1:first_row+emb_width, case]
        end
    end

    return grad_words./n_cases, grad_paras./n_cases, grad_W./n_cases, grad_b./n_cases
end

backprop (generic function with 1 method)

In [55]:
@doc "Overwrites the parameters of `pvdm` with the contents of the flat vector θ, laid out as produced by `pack`: word embeddings, paragraph embeddings, W, then b. The number of words and paragraphs must be unchanged; throws a DimensionMismatch if θ does not hold exactly one value per model parameter" ->
function unpack!(pvdm::PVDM, θ::Vector)
    n_words_params = length(pvdm.we.L)
    n_paras_params = length(pvdm.pe.L)
    n_W = length(pvdm.W)
    n_b = length(pvdm.b)
    n_total = n_words_params + n_paras_params + n_W + n_b

    # Without this check an over-long θ would be silently truncated.
    if length(θ) != n_total
        throw(DimensionMismatch("θ has $(length(θ)) elements but the model has $(n_total) parameters"))
    end

    # Each segment is copied out of θ and given the shape of the field it replaces.
    start = 0
    stop = n_words_params
    pvdm.we.L = reshape(θ[start+1:stop], size(pvdm.we.L)...)

    start = stop
    stop += n_paras_params
    pvdm.pe.L = reshape(θ[start+1:stop], size(pvdm.pe.L)...)

    start = stop
    stop += n_W
    pvdm.W = reshape(θ[start+1:stop], size(pvdm.W)...)

    start = stop
    stop += n_b
    pvdm.b = θ[start+1:stop]

    pvdm
end


@doc "Flattens the four parameter arrays into one vector, in the order L, D, W, b (matrices are flattened column by column). This assumes the number of words and paragraphs known remains constant" ->
function pack{N}(L::AbstractMatrix{N}, D::AbstractMatrix{N}, W::AbstractMatrix{N},b::AbstractVector{N})
    return [vec(L); vec(D); vec(W); b]
end

@doc "Flattens all parameters of `pvdm` into one vector: word embeddings, paragraph embeddings, W, then b. This assumes the number of words and paragraphs known remains constant" ->
function pack(pvdm::PVDM)
    word_embeddings = pvdm.we.L
    para_embeddings = pvdm.pe.L
    return pack(word_embeddings, para_embeddings, pvdm.W, pvdm.b)
end


pack (generic function with 2 methods)

In [56]:
# Build a small model and compute the analytic gradient `ag` via backprop.
const WINDOW_LEN = 8 
# Pad every paragraph to at least WINDOW_LEN+1 words, the minimum length that
# get_para_training_cases! asserts.
training = Vector{String}[pad(para, WINDOW_LEN+1) for para in training]

# 200-dimensional Float32 word embeddings covering every word of the corpus.
we_outer = WE(Float32,String, 200)
add_all_words!(we_outer, training)
pvdm_outer = PVDM(we_outer, WINDOW_LEN);
# Rebind the global; the embeddings stay reachable through pvdm_outer.we.
we_outer=0


# One column per training case: row 1 is the paragraph index, the middle rows are the
# window word indexes and the last row is the label word index.
training_indexes = @pipe chain(map(para -> get_para_training_cases!(pvdm_outer, para), training)...) |> hcat(_...)
para_indexes_o = training_indexes[1,:] |> vec
window_indexes_o = training_indexes[2:end-1,:] 
label_indexes_o = training_indexes[end,:] |> vec;

# Forward pass, then the analytic gradients of the loss.
ŷs,xs = feedforward(pvdm_outer, para_indexes_o, window_indexes_o)
ΔL, ΔD, ΔW, Δb  = backprop(pvdm_outer, label_indexes_o, ŷs,xs, para_indexes_o, window_indexes_o)

# Analytic gradient flattened in the same order that pack(pvdm) uses for the parameters.
ag = pack(ΔL, ΔD, ΔW, Δb)

42421-element Array{Float32,1}:
 -0.000109332
 -4.84561e-5 
 -9.56933e-6 
  5.8016e-5  
  1.8734e-5  
 -7.14103e-6 
  5.63339e-5 
  4.13632e-7 
 -4.80776e-5 
  6.11413e-5 
 -5.46602e-5 
 -3.38046e-5 
  9.00398e-5 
  ⋮          
 -0.0293528  
  0.0476082  
  0.0475605  
  0.0476708  
  0.0475885  
  0.0476158  
  0.0475833  
 -0.106188   
 -0.0293897  
 -0.0292676  
 -0.0292313  
 -0.0293492  

In [57]:
using DualNumbers
using ForwardDiff

# Scalar objective for the gradient check: the loss of the batch as a function of the
# flat parameter vector θ.
#
# A light copy of the global model with dual-number parameters is built on every call
# and overwritten with θ, so `pvdm_outer` itself is not modified here.
function f(θ)
    dual_type = Dual{typeof(pvdm_outer.varience)}
    pvdm_inner = PVDM_light(pvdm_outer, dual_type)
    unpack!(pvdm_inner, θ)
    predictions, _ = feedforward(pvdm_inner, para_indexes_o, window_indexes_o)
    return loss(label_indexes_o, predictions)
end

# Using forwarddiff_gradient (dual-number forward mode): `g` evaluates the gradient of `f`
# for a parameter vector with as many entries as pack(pvdm_outer).
g = forwarddiff_gradient(f, typeof(pvdm_outer.varience), fadtype=:dual, n=length(pack(pvdm_outer)))


g (generic function with 1 method)

In [58]:
# Gradient of `f` at the current model parameters, from automatic differentiation.
θ = pack(pvdm_outer)
dg = g(θ)

42421-element Array{Float32,1}:
 -0.000109332
 -4.84561e-5 
 -9.56933e-6 
  5.8016e-5  
  1.8734e-5  
 -7.14103e-6 
  5.63339e-5 
  4.13633e-7 
 -4.80776e-5 
  6.11413e-5 
 -5.46602e-5 
 -3.38046e-5 
  9.00399e-5 
  ⋮          
 -0.0293527  
  0.0476082  
  0.0475605  
  0.0476708  
  0.0475885  
  0.0476158  
  0.0475833  
 -0.106188   
 -0.0293897  
 -0.0292676  
 -0.0292313  
 -0.0293492  

In [None]:
# Side-by-side check, one row per parameter: automatic-differentiation gradient,
# analytic (backprop) gradient, their absolute difference, and their ratio.
[dg ag abs(dg-ag) dg./ag] #[1:length(pvdm_outer.we.L), :]

42421x4 Array{Float32,2}:
 -0.000109332  -0.000109332  2.18279e-11  1.0
 -4.84561e-5   -4.84561e-5   3.63798e-12  1.0
 -9.56933e-6   -9.56933e-6   1.81899e-12  1.0
  5.8016e-5     5.8016e-5    1.09139e-11  1.0
  1.8734e-5     1.8734e-5    5.45697e-12  1.0
 -7.14103e-6   -7.14103e-6   1.36424e-12  1.0
  5.63339e-5    5.63339e-5   7.27596e-12  1.0
  4.13633e-7    4.13632e-7   5.40012e-13  1.0
 -4.80776e-5   -4.80776e-5   3.63798e-12  1.0
  6.11413e-5    6.11413e-5   0.0          1.0
 -5.46602e-5   -5.46602e-5   7.27596e-12  1.0
 -3.38046e-5   -3.38046e-5   0.0          1.0
  9.00399e-5    9.00398e-5   1.45519e-11  1.0
  ⋮                                          
 -0.0293527    -0.0293528    5.58794e-9   1.0
  0.0476082     0.0476082    7.45058e-9   1.0
  0.0475605     0.0475605    0.0          1.0
  0.0476708     0.0476708    0.0          1.0
  0.0475885     0.0475885    0.0          1.0
  0.0476158     0.0476158    3.72529e-9   1.0
  0.0475833     0.0475833    0.0          1.0
 -0.1061

In [36]:
# Number of parameters whose gradient ratio is exactly 1. `./` binds tighter than
# `.==`, so this is (dg./ag) .== 1 -- an exact floating-point comparison, which
# ratios that merely display as 1.0 need not satisfy.
countnz(dg./ag.==1)

0

In [51]:
# Length of the bias vector; the constructor creates one bias entry per indexed word.
length(pvdm_outer.b)

21