In [25]:
using Iterators
using DataStructures
# Debug helper: print the source text of an expression, then " = ", then its value.
macro printval(ee)
    label = string(ee)
    # Build the println call by hand; escape it so `ee` is evaluated in the caller's scope.
    esc(Expr(:call, :println, label, " = ", ee))
end

# Debug helper: print the source text of an expression followed by the type and
# size of its value (tab separated). The expression is evaluated twice.
macro pz(ee)
    label = string(ee)
    type_call = Expr(:call, :typeof, ee)
    size_call = Expr(:call, :size, ee)
    # Escaped so that `ee` resolves in the caller's scope.
    esc(Expr(:call, :println, label, "\t\t", type_call, "\t", size_call))
end

In [26]:
using Pipe
# Make the sibling word-embeddings checkout visible to `using`.
push!(LOAD_PATH, "../word-embeddings2")
using WordEmbeddings
# Load the pretrained word2vec binary and splat the loader's result into the WE
# constructor. `we` is a global read by load_data / load_and_embed below.
# NOTE(review): the second argument (15000) is presumably a cap on the number of
# words loaded -- confirm against load_word2vec_embeddings.
we = @pipe load_word2vec_embeddings("../../Resources/example_code/word2vec/GoogleNews-vectors-negative300.bin", 15000) |> WE(_...);

In [27]:
# Read a whitespace-separated hypernym file and flatten it into parallel word lists.
#
# Each non-blank line is `<hypernym> <hyponym> <hyponym> ...`. A line is kept only
# if its hypernym is known to the global embedding `we`; within a kept line only
# the hyponyms known to `we` are kept.
#
# Returns `(data, labels)`: `data[k]` is a hyponym and `labels[k]` the hypernym of
# the line it came from, in file order.
function load_data(filepath)
    data = String[]
    labels = String[]

    open(filepath) do filehandle
        for line in eachline(filehandle)
            fields = split(line)
            # Blank / whitespace-only lines have no hypernym; indexing fields[1]
            # on them would throw a BoundsError.
            isempty(fields) && continue

            hyper = fields[1]
            has_word(we, hyper) || continue # Skip ones we don't have

            hypos = filter(w -> has_word(we, w), fields[2:end])
            append!(data, hypos)
            append!(labels, fill(hyper, length(hypos)))
        end
    end

    data, labels
end


load_data (generic function with 1 method)

In [28]:
# Load the word pairs stored at `path`, embed both sides with the global `we`,
# and return `(data, labels)` with their columns shuffled by one shared random
# permutation so that column k of each still belongs to the same pair.
function load_and_embed(path)
    words, label_words = load_data(path)
    word_vecs = eval_word_embeddings(we, words)
    label_vecs = eval_word_embeddings(we, label_words)
    shuffle_order = randperm(size(word_vecs, 2))
    (word_vecs[:, shuffle_order], label_vecs[:, shuffle_order])
end

# Training split: embedded words (`data`) and the embedded label word of each (`labels`).
data, labels = load_and_embed("HyponymGen/hyponym-generation-noun-train.txt")
# Development split, used for validation further down.
data_valid, labels_valid = load_and_embed("HyponymGen/hyponym-generation-noun-dev.txt")

(
300x31385 Array{Float64,2}:
 -0.0581674  -0.0671601    -0.00800945  …  -0.027202     0.0867118 
  0.0520085  -0.0491877    -0.0703835      -0.00294688   0.075921  
  0.0708274  -0.0209678    -0.0100844      -0.0652847    0.0566517 
  0.0144563   0.107835     -0.00813395     -0.00516837   0.060891  
 -0.146445   -0.0845019    -0.0531196       0.0182253   -0.00674425
 -0.0448231   0.0253821    -0.0108729   …   0.0725385    0.0196547 
 -0.076302    0.00409897   -0.0252318       0.0819685    0.0570371 
 -0.0831452  -0.124861      0.0275558       0.0160492   -0.117928  
 -0.0605626  -0.00291658    0.0511277      -0.0337304    0.0766918 
  0.0136009   0.0532866     0.0634116       0.0544039    0.0581933 
  0.0455075  -0.0342107     0.0406697   …  -0.0681862    0.0639741 
  0.0210429  -0.038152     -0.0474757       0.0297408   -0.0855557 
  0.0684323  -0.0124546     0.00294648      0.0134196   -0.0263989 
  ⋮                                     ⋱                          
  0.0306234  -0.05

In [29]:
# Parameters of a fully connected feed-forward network.
# Ws[k] is the weight matrix and bs[k] the bias vector of layer k; feedfoward
# applies them as tanh(Ws[k]*a .+ bs[k]).
type NN 
    Ws:: Vector{Matrix{Float64}} 
    bs:: Vector{Vector{Float64}} 
end

# Build a randomly initialised network.
#   layer_sizes : unit count per layer, input layer first.
#   var         : multiplier applied to standard-normal draws (despite the name it
#                 scales the samples, i.e. it acts as a standard deviation).
# All weight matrices are drawn before any bias vector.
function NN(layer_sizes::Vector{Int}, var=0.01)
    layers = 2:length(layer_sizes)
    Ws = Matrix{Float64}[var * randn(layer_sizes[k], layer_sizes[k-1]) for k in layers]
    bs = Vector{Float64}[var * randn(layer_sizes[k]) for k in layers]
    NN(Ws, bs)
end

# Allocate a network with the right shapes but UNINITIALISED weights and biases.
# Callers are expected to overwrite every entry (e.g. via unpack!).
function NN_empty(layer_sizes::Vector{Int})
    layers = 2:length(layer_sizes)
    Ws = Matrix{Float64}[Array(Float64, (layer_sizes[k], layer_sizes[k-1])) for k in layers]
    bs = Vector{Float64}[Array(Float64, layer_sizes[k]) for k in layers]
    NN(Ws, bs)
end


NN_empty (generic function with 1 method)

In [None]:
# Forward pass over a batch stored column-wise in `xs`.
# Returns `(output, as)`, where `as[1]` is `xs` itself and `as[k+1]` is the tanh
# activation of layer k; `output` is `as[end]` (the same array, not a copy).
function feedfoward(nn::NN, xs::AbstractMatrix{Float64})
    nlayers = length(nn.Ws)
    as = AbstractMatrix{Float64}[xs]
    for layer in 1:nlayers
        pre_activation = nn.Ws[layer] * as[layer] .+ nn.bs[layer]
        push!(as, tanh(pre_activation))
    end
    (as[end], as)
end

# Back-propagate through the tanh network.
#   nn        : the network whose weights were used for the forward pass.
#   ys        : targets, one column per example.
#   as        : activations as returned by feedfoward (as[1] = inputs, as[end] =
#               outputs). Accepted as any AbstractVector: feedfoward returns a
#               Vector{AbstractMatrix{Float64}}, which is NOT a
#               Vector{Matrix{Float64}}, so the narrower annotation rejected it.
#   loss_grad : gradient of the loss w.r.t. the outputs; defaults to the
#               squared-error gradient as[end]-ys.
# Returns `(ΔWs, Δbs)`, summed over the batch, shaped like nn.Ws / nn.bs.
function backprop(nn::NN, ys::AbstractMatrix{Float64}, as::AbstractVector, loss_grad= as[end]-ys)
    # Derivative of tanh expressed through its output a = tanh(z).
    dZ(a) = 1.0 - a.^2

    nlayers = length(nn.Ws)
    Δbs = Array(Vector{Float64}, nlayers)
    ΔWs = Array(Matrix{Float64}, nlayers)

    δ_above = loss_grad .* dZ(as[end])
    for ii in nlayers:-1:1
        Δbs[ii] = sum(δ_above, 2)[:]
        ΔWs[ii] = δ_above * as[ii]'
        # The delta for the input layer is never used, so do not compute it.
        if ii > 1
            δ_above = (nn.Ws[ii]' * δ_above) .* dZ(as[ii])
        end
    end

    ΔWs, Δbs
end



# Summed squared-error loss: 0.5 * (ys - ŷs)^2 per element, reduced over the
# second dimension first and then over the remaining entries.
function loss(ŷs, ys)
    half_sq_err = 0.5 * (ys - ŷs).^2
    per_row = sum(half_sq_err, 2)
    sum(per_row)
end

In [None]:
# Norm of every column of A, returned as a vector with one entry per column.
function colnorm(A)
    ncols = size(A, 2)
    [norm(A[:, col]) for col in 1:ncols]
end
# Column-wise dot product: entry k is dot(A[:,k], B[:,k]). The column count is taken from A.
function coldot(A, B)
    ncols = size(A, 2)
    [dot(A[:, col], B[:, col]) for col in 1:ncols]
end

# Cosine similarity between matching columns of ys and ts (one value per column).
function cosine_sim(ys,ts)
    dots = coldot(ys, ts)
    norm_products = colnorm(ys) .* colnorm(ts)
    dots ./ norm_products
end

# Cosine-based loss: half the squared (1 - cosine similarity) of each column pair,
# averaged over the columns.
function loss_cosine(ŷs, ys)
    sims = cosine_sim(ŷs, ys)
    mean(0.5(1.0 - sims).^2)
end

# Per-column derivative term for the cosine loss, scaled by (1 - cosine_sim).
#   ys : predictions, one column per example.
#   ts : targets, same shape as ys.
# Returns an array shaped like ys.
#
# NOTE(review): the per-column expression below does not match the analytic
# gradient of cosine similarity, t/(|t||y|) - (y.t) y/(|y|^3 |t|), and the sign /
# mean normalisation of loss_cosine are not applied. Only the loop range is fixed
# here; confirm the intended formula before relying on this function.
function loss_diff_cosine(ys, ts)
    df = similar(ys)
    # Visit EVERY column. `for jj in size(df,2)` iterated over the single number
    # size(df,2), filling only the last column and leaving the rest uninitialised.
    for jj in 1:size(df,2)
        tjs = ts[:,jj]
        yjs = ys[:,jj]

        normprod = norm(tjs)*norm(yjs)
        df[:,jj] = tjs./normprod + abs(yjs).*norm(tjs)./(normprod.^3)
    end

    df.*(1.0-cosine_sim(ys,ts))'
end

In [31]:
using Mocha

# Training network in Mocha: in-memory data -> ip1 (500 units) -> ip2 (300 units) -> square loss.
# Mini-batches of 10 columns are served from the `data` / `labels` matrices.
data_l = MemoryDataLayer(name="train-data",tops=[:data,:label], data=Array[data, labels],batch_size=10)
# NOTE(review): GaussianInitializer(0.0,0.0001) presumably means (mean, std) -- confirm in the Mocha docs.
fc1   = InnerProductLayer(name="ip1",output_dim=500,neuron=Neurons.ReLU(),
bottoms=[:data],tops=[:ip1], weight_init=GaussianInitializer(0.0,0.0001))
# NOTE(review): the output layer also uses ReLU, so predictions cannot be negative,
# while the embedding matrices printed above contain negative entries. The logged
# objective staying at 0.5 may be related -- worth checking.
fc2   = InnerProductLayer(name="ip2",output_dim=300,neuron=Neurons.ReLU(),
                          bottoms=[:ip1], tops=[:ip2],  weight_init=GaussianInitializer(0.0,0.0001))
loss_l = SquareLossLayer(name="loss", bottoms=[:ip2,:label])
backend = CPUBackend()
init(backend)
# The two inner-product layers are reused by the validation network defined later.
common_layers = [fc1,fc2]
net = Net("nym-train", backend, [data_l, common_layers..., loss_l])


10-Jun 10:27:30:INFO:root:Constructing net nym-train on CPUBackend...
10-Jun 10:27:30:INFO:root:Topological sorting 4 layers...
10-Jun 10:27:30:INFO:root:Setup layers...
10-Jun 10:27:30:INFO:root:Network constructed!


************************************************************
          NAME: nym-train
       BACKEND: CPUBackend
  ARCHITECTURE: 4 layers
............................................................
 *** MemoryDataLayer(train-data)
    Outputs ---------------------------
          data: Blob(300 x 10)
         label: Blob(300 x 10)
............................................................
 *** InnerProductLayer(ip1)
    Inputs ----------------------------
          data: Blob(300 x 10)
    Outputs ---------------------------
           ip1: Blob(500 x 10)
............................................................
 *** InnerProductLayer(ip2)
    Inputs ----------------------------
           ip1: Blob(500 x 10)
    Outputs ---------------------------
           ip2: Blob(300 x 10)
............................................................
 *** SquareLossLayer(loss)
    Inputs ----------------------------
           ip2: Blob(300 x 10)
         label: Blob(300 x 10)
*************

In [32]:
# Solver configuration: Nesterov momentum, at most 10000 iterations.
# `load_from=exp_dir` points the solver at the snapshot directory.
# NOTE(review): LRPolicy.Inv(0.01, 0.0001, 0.75) is presumably (base_lr, gamma, power) -- confirm in the Mocha docs.
exp_dir = "snapshots"
params = SolverParameters(max_iter=10000, regu_coef=0.0005,
    mom_policy=MomPolicy.Fixed(0.9),
    lr_policy=LRPolicy.Inv(0.01, 0.0001, 0.75),
    load_from=exp_dir)
solver =  Nesterov(params)

# Statistics are written to snapshots/statistics.jld every 1000 iterations.
setup_coffee_lounge(solver, save_into="$exp_dir/statistics.jld", every_n_iter=1000)

# report training progress every 100 iterations
add_coffee_break(solver, TrainingSummary(), every_n_iter=100)




1-element Array{CoffeeBreak,1}:
 CoffeeBreak(TrainingSummary(),100,0)

In [33]:
# Validation network: same inner-product layers as the training net (via
# common_layers), fed from the dev split in batches of 1000.
data_valid_l = MemoryDataLayer(name="validation-data",tops=[:data,:label], data=Array[data_valid, labels_valid],batch_size=1000)
# Despite its name, `accuracy` is a square-loss layer, so the reported figure is a loss (lower is better).
accuracy = SquareLossLayer(name="validation-loss",bottoms=[:ip2, :label])
valid_net = Net("nym-valid", backend, [data_valid_l, common_layers..., accuracy])
# Evaluate on the validation net every 1000 iterations.
add_coffee_break(solver, ValidationPerformance(valid_net), every_n_iter=1000)

10-Jun 10:27:34:INFO:root:Constructing net nym-valid on CPUBackend...
10-Jun 10:27:34:INFO:root:Topological sorting 4 layers...
10-Jun 10:27:34:INFO:root:Setup layers...
10-Jun 10:27:34:DEBUG:root:InnerProductLayer(ip1): sharing weights and bias
10-Jun 10:27:34:DEBUG:root:InnerProductLayer(ip2): sharing weights and bias
10-Jun 10:27:34:INFO:root:Network constructed!


2-element Array{CoffeeBreak,1}:
 CoffeeBreak(TrainingSummary(),100,0)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   

In [34]:
# Run the configured solver on the training network (coffee breaks fire as registered above).
solve(solver, net)


10-Jun 10:27:39:DEBUG:root:Checking network topology for back-propagation
10-Jun 10:27:39:DEBUG:root:Init network nym-train
10-Jun 10:27:39:DEBUG:root:Init parameter weight for layer ip1
10-Jun 10:27:39:DEBUG:root:Init parameter bias for layer ip1
10-Jun 10:27:39:DEBUG:root:Init parameter weight for layer ip2
10-Jun 10:27:39:DEBUG:root:Init parameter bias for layer ip2
10-Jun 10:27:39:DEBUG:root:Initializing coffee breaks
10-Jun 10:27:39:INFO:root:Merging existing coffee lounge statistics in snapshots/statistics.jld
10-Jun 10:27:39:DEBUG:root:Init network nym-valid
10-Jun 10:27:39:INFO:root:000000 :: TRAIN obj-val = 0.50000006
10-Jun 10:27:49:INFO:root:
10-Jun 10:27:49:INFO:root:## Performance on Validation Set after 0 iterations
10-Jun 10:27:49:INFO:root:---------------------------------------------------------
10-Jun 10:27:49:INFO:root:  Square-loss (avg over 32000) = 0.5000
10-Jun 10:27:49:INFO:root:---------------------------------------------------------
10-Jun 10:27:49:INFO:root:

2-element Array{Array{Nothing,1},1}:
 [nothing,nothing]
 [nothing,nothing]

In [19]:
# Tear down both Mocha networks once training is finished.
destroy(net)
destroy(valid_net)

09-Jun 16:02:18:DEBUG:root:Destroying network nym-train
09-Jun 16:02:18:DEBUG:root:Destroying network nym-valid


In [None]:
# Copy a flat parameter vector back into `nn`, in place.
# Layout of θ (the inverse of `pack`): every weight matrix in column-major order,
# layer by layer, followed by every bias vector. Returns `nn`.
function unpack!(nn::NN, θ::Vector)
    offset = 0
    for W in nn.Ws
        n = length(W)
        W[:] = θ[offset+1:offset+n]
        offset += n
    end
    for b in nn.bs
        n = length(b)
        b[:] = θ[offset+1:offset+n]
        offset += n
    end
    nn
end

# Flatten all parameters of `nn` into one vector (weights first, then biases).
pack(nn::NN) = pack(nn.Ws, nn.bs)

# Flatten weight matrices and bias vectors into a single Vector{Float64}.
# Layout: each matrix in Ws in column-major order, in sequence, followed by each
# vector in bs. Always returns a Vector{Float64}, including for empty input, and
# does not splat the collections into varargs.
function pack(Ws::Vector{Matrix{Float64}}, bs::Vector{Vector{Float64}})
    θ = Float64[]
    for W in Ws
        append!(θ, vec(W))
    end
    for b in bs
        append!(θ, b)
    end
    θ
end


In [None]:
importall EmpiricalRisks
using ArrayViews

# Prediction model wrapper for a feed-forward network, identified only by its
# layer sizes (input layer first, output layer last); the parameters themselves
# travel separately as a flat vector.
type NNPred <: PredictionModel{1,1}
    layer_sizes::Vector{Int}
end

# Number of input units (size of the first layer).
function inputlen(pm::NNPred)
    first(pm.layer_sizes)
end

# Input shape as a 1-tuple.
function inputsize(pm::NNPred)
    (inputlen(pm),)
end

# Number of output units (size of the last layer).
function outputlen(pm::NNPred)
    last(pm.layer_sizes)
end

# Output shape as a 1-tuple.
function outputsize(pm::NNPred)
    (outputlen(pm),)
end
# Total number of scalar parameters: one weight per connection between
# consecutive layers plus one bias per non-input unit.
# Written as an explicit sum rather than `sizes ⋅ [0, sizes[1:end-1]]`, which
# relied on comma-concatenation inside an array literal.
function paramlen(pm::NNPred)
    sizes = pm.layer_sizes
    nweights = 0
    for ii in 2:length(sizes)
        nweights += sizes[ii] * sizes[ii-1]
    end
    nweights + sum(sizes[2:end])
end
# Parameter shape as a 1-tuple. The original returned `(paramlen,)`, a tuple
# holding the function itself rather than the parameter count.
paramsize(pm::NNPred) = (paramlen(pm),)
# Number of examples in `x`: the column count for a matrix, 1 for a vector.
# Asserts that the first dimension matches the model's input length and that `x`
# has at most two dimensions.
function ninputs(pm::NNPred, x)
    @assert(size(x,1)==inputlen(pm))
    @assert(ndims(x)<=2)
    # `ndims==2` compared the function object with 2 (always false), so every
    # input was reported as a single example.
    ndims(x)==2 ? size(x,2) : 1
end

# Single example: lift the vector to a one-column matrix and defer to the batch method.
function predict(pm::NNPred, theta::Vector, x::Vector)
    predict(pm, theta, x'')
end

# Batch prediction: rebuild the network from the flat parameter vector `theta`
# and return the output-layer activations for the columns of `x`.
function predict(pm::NNPred, theta::Vector, x::Matrix)
    net = NN_empty(pm.layer_sizes)
    unpack!(net, theta)
    feedfoward(net, x)[1]
end


# Risk value and gradient accumulation for an NNPred risk model.
#   rm          : risk model; rm.predmodel gives the layer sizes, rm.loss the loss.
#   beta, alpha : on return grad holds beta*grad + alpha*(gradient w.r.t. theta).
#   grad        : gradient buffer, updated in place.
#   theta       : flat parameter vector (layout of `pack`).
#   x, y        : inputs and targets, one column per example (vectors are lifted
#                 to one-column matrices).
# Returns the loss summed over the examples.
#
# Fixes relative to the earlier draft: each example is paired with its own
# target column (it always used column 2); the risk model's loss is used instead
# of the unrelated global `loss` function; the loss gradient goes into its own
# buffer (writing it over the network output also overwrote as[end], which
# backprop needs); the summed value is returned instead of the function `loss`;
# the debug printing is removed.
# NOTE(review): the value_and_grad!(loss, g, p, y) argument order follows the
# original call -- confirm against EmpiricalRisks.
function value_and_addgrad!{T<:MultivariateLoss}(rm::SupervisedRiskModel{NNPred, T}, beta, grad, alpha, theta, x, y)
    if ndims(x)==1
        x=x''
        y=y''
    end
    nn = unpack!(NN_empty(rm.predmodel.layer_sizes), theta)
    ŷ, as = feedfoward(nn, x)

    loss_grad = similar(ŷ)
    v = 0.0
    for i = 1:size(x,2)
        v_i, _ = value_and_grad!(rm.loss, view(loss_grad,:,i), view(ŷ,:,i), view(y,:,i))
        v += v_i
    end

    # feedfoward returns a Vector{AbstractMatrix{Float64}}; re-collect it with a
    # concrete element type before handing it to backprop.
    Δs = pack(backprop(nn, y, Matrix{Float64}[a for a in as], loss_grad)...)

    grad[:].*=beta
    grad[:].+=alpha.*Δs

    v
end



In [None]:
using SGDOptim
# Stochastic gradient descent on the 300-300-300 network, mini-batches of 50.
training_seq = minibatch_seq(data, labels,50)


pm = NNPred([300,300, 300])
rm = riskmodel(pm, SumSqrLoss())

# Start from identity weights and zero biases.
nn_init = NN_empty(pm.layer_sizes)
nn_init.Ws[1] = eye(300)
nn_init.Ws[2] = eye(300)
# NN_empty leaves the biases uninitialised; multiplying garbage by 0.0 yields NaN
# wherever the garbage was NaN or Inf, so zero them explicitly.
fill!(nn_init.bs[1], 0.0)
fill!(nn_init.bs[2], 0.0)
θ=pack(nn_init)
θ2=sgd_model = sgd(rm, θ, training_seq)

In [None]:
# Risk of the SGD solution on the training data, divided by the number of examples (columns).
value(rm, θ2, data, labels)/size(data,2)

In [None]:
# Network outputs for every training column under the SGD parameters.
ŷ=predict(pm, θ2, data)

In [None]:
# Globals read by loss_and_loss_grad!: the full training set and the network
# being optimised (one hidden layer of 1000 units).
xs = data
ys = labels
nn_outer = NN([size(xs,1), 1000, size(ys,1)])
# Alternative configurations kept from earlier experiments:
#nn_outer = NN([size(xs,1), 300, 300, size(ys,1)])
#nn_outer.Ws[1]=W1
#nn_outer.Ws[2]=W2


# Objective for the batch optimisers: load θ into the global `nn_outer`, run the
# whole training set (globals `xs`, `ys`) through it, write the flattened
# gradient into `grad` and return the summed squared-error loss.
function loss_and_loss_grad!(θ::Vector, grad::Vector)
    unpack!(nn_outer, θ)
    outputs, activations = feedfoward(nn_outer, xs)
    ΔWs, Δbs = backprop(nn_outer, ys, activations)
    grad[:] = pack(ΔWs, Δbs)
    loss(outputs, ys)
end

# Value-only objective: intentionally unimplemented, always raises.
loss!(θ::Vector) = error("loss! not defined")

# Gradient-only objective: intentionally unimplemented, always raises.
loss_grad!(θ::Vector, storage::Vector) = error("loss_grad not defined")



#---------------------
# Memoisation of loss_and_loss_grad!, keyed by the parameter vector.
# NOTE: the cache is unbounded; every distinct θ keeps a full-size copy of θ and
# of its gradient alive.
loss_and_loss_grad_cache = Dict{Vector{Float64},(Float64, Vector{Float64})}()
loss_and_loss_grad_cache_hits = 0
loss_and_loss_grad_cache_misses = 0

# Cached objective: returns the loss at θ and writes the gradient into `grad`.
# Hit / miss counters are kept in the globals above.
function cached_loss_and_loss_grad!(θ::Vector, grad::Vector)
    global loss_and_loss_grad_cache
    global loss_and_loss_grad_cache_hits
    global loss_and_loss_grad_cache_misses
    if haskey(loss_and_loss_grad_cache,θ)
        loss_and_loss_grad_cache_hits+=1
        err, cached_grad = loss_and_loss_grad_cache[θ]
        grad[:] = cached_grad
    else
        loss_and_loss_grad_cache_misses+=1
        err = loss_and_loss_grad!(θ, grad)
        # Store copies: callers may reuse and overwrite both buffers in place,
        # which would silently change a cached gradient and leave a key whose
        # contents no longer match its hash.
        loss_and_loss_grad_cache[copy(θ)] = (err, copy(grad))
    end
    err
end


In [None]:
using Optim #https://github.com/JuliaOpt/Optim.jl
# L-BFGS via Optim, using only the combined value+gradient callback (the two
# single-purpose callbacks raise if Optim ever calls them).
f=DifferentiableFunction(loss!,loss_grad!,cached_loss_and_loss_grad!)
# Warm-start from the previous result when this cell is re-run; in a fresh
# session `res` does not exist yet, so start from the network's current parameters.
θ = isdefined(:res) ? res.minimum : pack(nn_outer)
res = optimize(f, θ, method=:l_bfgs, show_trace = true, store_trace = true, iterations = 1000);
@printval res.f_calls 
@printval res.g_calls 
@printval res.f_minimum
@printval res.gr_converged
@printval res.iterations
@printval res.x_converged 

@printval res.trace
@printval loss_and_loss_grad_cache_hits
@printval loss_and_loss_grad_cache_misses

In [None]:
using NLopt

# Number of objective evaluations requested so far.
f_call_count = 0

# Objective wrapper that prints one progress line per call
# (call number, loss, gradient norm; tab separated) and returns the loss.
function tracking_loss_and_loss_grad!(θ::Vector, grad::Vector)
    global f_call_count
    f_call_count += 1
    value = cached_loss_and_loss_grad!(θ, grad)
    gradnorm = norm(grad)
    println(f_call_count, '\t', value, '\t', gradnorm)
    value
end
# Gradient-based NLopt algorithms that could be used here:
#:LD_MMA, :LD_CCSAQ, :LD_LBFGS, :LD_SLSQP, :LD_VAR2, :LD_VAR1, :LD_TNEWTON_RESTART
# One optimisation variable per network parameter.
opt = Opt(:LD_LBFGS, length(pack(nn_outer)))

#ftol_abs!(opt,1e-9)
# NOTE(review): 60*60*8 is presumably a time budget of 8 hours in seconds -- confirm the unit in the NLopt docs.
maxtime!(opt, 60*60*8)
min_objective!(opt, tracking_loss_and_loss_grad!)

# Start from the network's current parameters.
θ = pack(nn_outer)


# optf: final objective value, optx: final parameter vector, ret: NLopt return code.
(optf,optx,ret) = optimize!(opt,θ)


In [None]:
# Load the optimiser's solution back into the network.
unpack!(nn_outer, optx);

In [None]:
# Small slice of the (shuffled) training columns for manual inspection.
subset =  100:120
xos = data[:,subset]
yos = labels[:,subset]

# Alternative: predictions from the batch-optimised network.
#ŷos,_ = feedfoward(nn_outer,xos);

In [None]:
# Predictions for the inspection subset, taken from ŷ (the predict(pm, θ2, data) result).
ŷos=ŷ[:,subset]
# Summed squared-error loss of those predictions over the whole training set.
loss(ŷ,labels)

In [None]:
using Distances

import WordEmbeddings.show_bests
import WordEmbeddings.show_best
import WordEmbeddings.neighbour_sims

# Similarity of the vector `cc` to every column of `globe`, in column order.
# `similarity` is called as similarity(cc, column).
function neighbour_sims(cc::Vector{Float64}, globe::Matrix{Float64}, similarity=cosine_sim)
    ncols = size(globe, 2)
    [similarity(cc, globe[:, col]) for col in 1:ncols]
end


# The `nbest` vocabulary entries most similar to the embedding ĉ.
#   embedder   : provides the embedding matrix `L` (one column per word) and the
#                parallel word list `indexed_words`.
#   nbest      : how many entries to return (clamped to the vocabulary size).
#   similarity : called as similarity(ĉ, column); larger means more similar.
# Returns a matrix with one row per hit: [word  score rounded to 2 decimals],
# best first.
#
# Ranking is by sorted index rather than by looking each top score up again with
# findfirst: that lookup returned the same word for tied scores and index 0 (a
# BoundsError) for NaN scores.
function show_best(embedder,ĉ::Embedding, nbest=20, similarity=cosine_sim )
    candidates = neighbour_sims(ĉ, embedder.L, similarity)
    ntop = min(nbest, length(candidates))
    best_ids = sortperm(candidates, rev=true)[1:ntop]
    vcat([[embedder.indexed_words[ii] round(candidates[ii],2)] for ii in best_ids]...)
end

# Run show_best on every column of ĉs and place the per-column results side by side.
function show_bests(embedder,ĉs::Embeddings, nbest=20, similarity=cosine_sim)
    ncols = size(ĉs, 2)
    per_column = [show_best(embedder, ĉs[:, col], nbest, similarity) for col in 1:ncols]
    hcat(per_column...)
end



In [None]:
# Closest vocabulary word to each predicted embedding: with nbest=1 the result
# has one row of alternating word / score entries, so 1:2:end keeps the words.
show_bests(we, ŷos, 1)[1,1:2:end] #(x,y)->-1.0*Distances.euclidean(x,y))

In [None]:
# Three nearest vocabulary words (with scores) for each true label embedding in the subset.
show_bests(we, yos, 3)# (x,y)->-1.0*Distances.euclidean(x,y))