In [1]:
# Load word embeddings from a whitespace-delimited text file in which every
# non-blank line has the form `word v1 v2 ... vd`.
#
# Returns the tuple `(LL, ee, indexed_word)`:
#   LL           - d-by-n embedding matrix, one column per distinct word
#   ee           - Dict mapping each word to its length-n one-hot BitVector,
#                  such that `LL*ee["example"]` returns the word embedding
#                  for "example"
#   indexed_word - the words in column order: `indexed_word[ii]` is the word
#                  whose embedding is `LL[:,ii]`
#
# Blank lines are skipped. If a word appears on several lines the last one wins.
# NOTE: `ee` stores n BitVectors of n bits each, so its size grows as n^2.
function load_embeddings(embedding_file)
    embeddingsDict = Dict{String,Vector{Float64}}()

    # The do-block form closes the file even when a line fails to parse.
    open(embedding_file) do io
        for line in eachline(io)
            fields = split(line)
            isempty(fields) && continue
            embeddingsDict[fields[1]] = map(parsefloat, fields[2:end])
        end
    end

    # Take a single snapshot of the key order and derive the matrix columns,
    # the one-hot positions and the reverse index from it, so all three agree.
    indexed_word = collect(keys(embeddingsDict))
    nwords = length(indexed_word)
    dim = nwords == 0 ? 0 : length(embeddingsDict[indexed_word[1]])

    LL = zeros(Float64, dim, nwords)
    ee = Dict{String,BitVector}()
    for (ii, word) in enumerate(indexed_word)
        LL[:, ii] = embeddingsDict[word]
        # `falses` guarantees every other bit is cleared; a bare `BitArray(n)`
        # is uninitialised storage and may contain stray set bits.
        onehot = falses(nwords)
        onehot[ii] = true
        ee[word] = onehot
    end

    return LL, ee, indexed_word
end

load_embeddings (generic function with 1 method)

In [2]:
# Everything needed to combine two word embeddings into a phrase embedding
# and to map a phrase embedding back towards its words.
type PhraseEmbedder
    L::Matrix{Float64}             # word embedding matrix
    e::Dict{String,BitVector}      # word => its one-hot representation
    indexed_words::Vector{String}  # index => word
    W::Matrix{Float64}             # word combination matrix
    iW::Matrix{Float64}            # pseudo-inverse of the word combination matrix
end

# Convenience constructor: takes the combination matrix `WW` first and derives
# its pseudo-inverse, so callers do not have to supply `iW` themselves.
function PhraseEmbedder(WW, LL, ee, indexed_words)
    inverse_WW = pinv(WW)
    return PhraseEmbedder(LL, ee, indexed_words, WW, inverse_WW)
end

PhraseEmbedder (constructor with 3 methods)

In [3]:
# Load the pretrained embeddings; `LL` holds one column per word.
LL,ee,indexed_words =  load_embeddings("embeddings-scaled.EMBEDDING_SIZE=50.txt");
# Random d-by-2d combination matrix (d = number of rows of `LL`), so that it
# maps two stacked word vectors onto a single vector of embedding size.
WW = rand(size(LL,1),2*size(LL,1))

phrase_emb = PhraseEmbedder(WW, LL, ee, indexed_words);
# Last expression of the cell: displays (embedding size, vocabulary size).
size(LL)

(50,268810)

In [4]:
# Phrase embedding of the word pair (`word1`, `word2`): look up both word
# vectors through their one-hot encodings, stack them, apply the combination
# matrix and squash elementwise with tanh.
# Indexing `pe.e` raises a KeyError for a word that is not in the vocabulary.
function embed(pe::PhraseEmbedder, word1, word2)
    child1 = pe.L * pe.e[word1]
    child2 = pe.L * pe.e[word2]
    stacked = [child1; child2]
    return map(tanh, pe.W * stacked)
end

embed (generic function with 1 method)

In [5]:
# Cosine of the angle between `a` and `b`. Despite the name this is a
# similarity, not a distance: 1 for parallel vectors, -1 for opposite ones,
# and larger means closer. Yields NaN when either vector has zero norm.
cosine_dist(a, b) = dot(a, b) / (norm(a) * norm(b))

# Score the vector `cc` against every column of `globe` with `cosine_dist`.
# Returns one score per column, in column order.
function neighbour_dists(cc::Vector{Float64}, globe::Matrix{Float64})
    ncols = size(globe, 2)
    scores = zeros(Float64, ncols)
    for col in 1:ncols
        scores[col] = cosine_dist(cc, globe[:, col])
    end
    return scores
end


neighbour_dists (generic function with 1 method)

In [28]:
# Approximately invert `embed`: undo the tanh, apply the pseudo-inverse of the
# combination matrix to estimate the two stacked word vectors, then list the
# best-scoring vocabulary words (by `neighbour_dists`) for each of them.
#
# Arguments:
#   pe    - the PhraseEmbedder that produced `p12`
#   p12   - a phrase embedding as returned by `embed`
#   nbest - how many candidates to list per word (default 5); clamped to the
#           vocabulary size
#
# Returns a k-by-4 table whose rows are `[word1 score1 word2 score2]`, best
# candidates first, where k = min(nbest, number of words).
#
# NOTE: an entry of `p12` equal to +1 or -1 makes `atanh` return Inf, so a
# saturated phrase embedding cannot be inverted meaningfully.
function resynth(pe::PhraseEmbedder, p12, nbest=5)
    # k-by-2 table `[word score]` of the highest-scoring candidates.
    # Ranking by index (sortperm) rather than looking each score up again
    # keeps tied scores attached to their own distinct words.
    function show_best(candidates)
        nshow = min(nbest, length(candidates))
        best = sortperm(candidates, rev=true)[1:nshow]
        hcat(pe.indexed_words[best], candidates[best])
    end

    # The embedding size comes from the embedding matrix instead of being
    # fixed at 50, so other embedding files work unchanged.
    dim = size(pe.L, 1)
    children = pe.iW * map(atanh, p12)
    c1_hat = children[1:dim]
    c2_hat = children[dim+1:2*dim]

    return [show_best(neighbour_dists(c1_hat, pe.L)) show_best(neighbour_dists(c2_hat, pe.L))]
end

resynth (generic function with 2 methods)

In [32]:
# Round trip on one example: embed the pair ("king", "queen"), then try to
# recover both words and show the 20 best candidates for each position.
words = split("king queen")
pp = embed(phrase_emb, words...)
resynth(phrase_emb, pp,20)

20x4 Array{Any,2}:
 "king"                   0.781599  "queen"          0.710362
 "workaholic"             0.662043  "warrior"        0.630731
 "80-year-old"            0.657657  "king"           0.622157
 "pontiff"                0.632303  "bouncer"        0.619687
 "swimmer"                0.627186  "youngest"       0.61015 
 "premier"                0.623988  "interpreter"    0.601928
 "contours"               0.611147  "driver"         0.599689
 "stag"                   0.604198  "peerage"        0.597686
 "assassin"               0.596773  "duchess"        0.591162
 "goalkicker"             0.592958  "winner"         0.590089
 "youngest"               0.592486  "Quebecois"      0.576958
 "88-year-old"            0.588176  "minister"       0.574704
 "49-year-old"            0.586584  "prodigy"        0.570894
 "Republican-controlled"  0.585768  "Skipper"        0.570151
 "24-year-old"            0.584658  "fifth-ranking"  0.569403
 "pulpit"                 0.582692  "swimmer"      

In [24]:
# Scratch check of `findfirst(A, v)`: it returns the index of the first
# element equal to `v`, so a repeated value always maps to its earliest
# position (4 occurs twice below; the result is 3).
aa = [0 2 4 6 8 4]

findfirst(aa, 4)

3

In [22]:
# Search the built-in help for entries mentioning "find".
apropos("find")

Base.findin(a, b)
Base.findfirst(A)
Base.findfirst(A, v)
Base.findfirst(predicate, A)
Base.findnz(A)
Base.find_library(names, locations)
Base.findnext(A, i)
Base.findnext(predicate, A, i)
Base.findnext(A, v, i)
Base.cholfact(A, [LU,][pivot=false,][tol=-1.0]) -> Cholesky
Base.cholfact(A[, ll]) -> CholmodFactor
Base.middle(x)
Base.middle(x, y)
Base.middle(range)
Base.middle(array)
Base.findn(A)
Base.findmax(itr) -> (x, index)
Base.findmax(A, dims) -> (maxval, index)
Base.eigs(A[, B], ; nev=6, which="LM", tol=0.0, maxiter=1000, sigma=nothing, ritzvec=true, v0=zeros((0, ))) -> (d[, v], nconv, niter, nmult, resid)
Base.find(A)
Base.find(f, A)
Base.findmin(itr) -> (x, index)
Base.findmin(A, dims) -> (minval, index)
