In [2]:
# Load whitespace-separated word embeddings from `embedding_file`.
#
# Every non-blank line must hold a word followed by its vector components,
# e.g. `example 0.1 -0.2 0.3`. If a word occurs more than once, the last
# occurrence wins.
#
# Returns the tuple `(LL, ee, indexed_word)`:
#   LL           - embedding matrix with one column per word
#   ee           - lookup from word to its one-hot BitVector, such that
#                  `LL*ee["example"]` returns the word embedding for "example"
#   indexed_word - the word that belongs to each column of `LL`
#
# Throws a DimensionMismatch when the vectors do not all have the same length.
#
# NOTE: every one of the N words gets its own length-N BitVector in `ee`, so
# the size of `ee` grows quadratically with the vocabulary.
function load_embeddings(embedding_file)
    embeddingsDict = Dict{String,Vector{Float64}}()
    # The do-block form closes the file handle even when parsing throws.
    open(embedding_file) do fh
        for line in eachline(fh)
            fields = split(line)
            # Blank lines (e.g. a trailing newline) carry no word; skip them.
            isempty(fields) && continue
            embeddingsDict[fields[1]] = Float64[parsefloat(f) for f in fields[2:end]]
        end
    end

    nwords = length(embeddingsDict)
    LL = zeros(Float64, 0, 0)
    ee = Dict{String,BitVector}()
    # Columns of `LL`, one-hot positions and `indexed_word` all follow the
    # iteration order of the same unmodified dictionary, so they stay aligned.
    for (ii, entry) in enumerate(embeddingsDict)
        word, embedding = entry
        if ii == 1
            # The first vector fixes the embedding dimension.
            LL = zeros(Float64, length(embedding), nwords)
        end
        if length(embedding) != size(LL, 1)
            throw(DimensionMismatch(
                "embedding for \"$word\" has $(length(embedding)) components, expected $(size(LL, 1))"))
        end
        LL[:, ii] = embedding

        # `falses` is zero-initialised; a bare `BitArray(n)` is not, which
        # could leave stray bits set in what must be a one-hot vector.
        onehot = falses(nwords)
        onehot[ii] = true
        ee[word] = onehot
    end
    indexed_word = collect(keys(embeddingsDict))

    return LL, ee, indexed_word
end

load_embeddings (generic function with 1 method)

In [3]:
# State needed to combine two word embeddings into one phrase embedding and
# to map a phrase embedding back towards its two words.
type PhraseEmbedder
    # Word embedding matrix
    L::Matrix{Float64}
    # Word to its one-hot representation
    e::Dict{String,BitVector}
    # Index back to word
    indexed_words::Vector{String}
    # Word combination matrix
    W::Matrix{Float64}
    # Pseudo-inverse of the word combination matrix
    iW::Matrix{Float64}
end

# Convenience constructor: takes the combination matrix first and derives the
# `iW` field as `pinv(WW)` instead of asking the caller for it.
PhraseEmbedder(WW, LL, ee, indexed_words) =
    PhraseEmbedder(LL, ee, indexed_words, WW, pinv(WW))

PhraseEmbedder (constructor with 3 methods)

In [4]:
# Load the embeddings; the path is resolved relative to the working directory.
LL,ee,indexed_words =  load_embeddings("embeddings-scaled.EMBEDDING_SIZE=50.txt");
# Random word combination matrix of size d x 2d, where d is the number of
# rows of LL: it maps two stacked d-element word vectors to one d-element vector.
WW = rand(size(LL,1),2*size(LL,1))

# Bundle everything; the constructor derives the pseudo-inverse of WW.
phrase_emb = PhraseEmbedder(WW, LL, ee, indexed_words);
# Last expression of the cell, so the notebook displays the matrix shape.
size(LL)

(50,268810)

In [17]:
# Combine the embeddings of `word1` and `word2` into one phrase embedding.
#
# Each word vector is obtained as `pe.L * pe.e[word]`; the two vectors are
# stacked into one column, multiplied by `pe.W` and passed through `tanh`
# element by element. A word that is not a key of `pe.e` raises a KeyError.
function embed(pe::PhraseEmbedder, word1, word2)
    child1 = pe.L * pe.e[word1]
    child2 = pe.L * pe.e[word2]
    stacked = vcat(child1, child2)
    return map(tanh, pe.W * stacked)
end

embed (generic function with 1 method)

In [18]:
# Cosine of the angle between `a` and `b`: their dot product divided by the
# product of their norms.
#
# Despite the name this is a similarity, not a distance: larger values mean
# the vectors point in more similar directions. A zero vector makes the
# denominator zero.
function cosine_dist(a, b)
    scale = norm(a) * norm(b)
    return dot(a, b) / scale
end

# Score `cc` against every column of `globe` with `cosine_dist`.
#
# Returns one score per column, in column order.
function neighbour_dists(cc::Vector{Float64}, globe::Matrix{Float64})
    ncols = size(globe, 2)
    scores = zeros(Float64, ncols)
    for col in 1:ncols
        scores[col] = cosine_dist(cc, globe[:, col])
    end
    return scores
end


neighbour_dists (generic function with 1 method)

In [24]:
# Map a phrase embedding `p12` back towards the two words it was built from.
#
# The steps of `embed` are undone in reverse: `atanh` is applied element by
# element, the result is multiplied by the pseudo-inverse `pe.iW`, and the
# product is split into two halves, one approximate embedding per child word.
# Each half is scored against every column of `pe.L` with `neighbour_dists`.
#
# Arguments:
#   pe    - the PhraseEmbedder that produced `p12`
#   p12   - phrase embedding, as returned by `embed`
#   nbest - number of candidates to list per child (default 5); capped at the
#           vocabulary size
#
# Returns a matrix with one row per rank, best first, and four columns:
# word and score for the first child, then word and score for the second.
function resynth(pe::PhraseEmbedder, p12, nbest=5)
    # Two-column table (word, score) of the best-scoring candidates.
    function show_best(candidates)
        keep = min(nbest, length(candidates))
        # Rank the indices directly instead of searching for each score
        # again: tied scores then resolve to distinct words.
        ranked = sortperm(candidates, rev=true)[1:keep]
        hcat(pe.indexed_words[ranked], candidates[ranked])
    end

    # The embedding width comes from the embedding matrix, so the split is
    # not tied to 50-dimensional vectors.
    dim = size(pe.L, 1)
    # One product yields both children stacked in a single vector.
    children = pe.iW * map(atanh, p12)
    child1 = children[1:dim]
    child2 = children[dim+1:2*dim]

    scores1 = neighbour_dists(child1, pe.L)
    scores2 = neighbour_dists(child2, pe.L)
    return hcat(show_best(scores1), show_best(scores2))
end

resynth (generic function with 2 methods)

In [28]:
# Round trip for the phrase "that man": embed the two words, then list the 26
# best-scoring vocabulary words for each reconstructed child embedding.
words = split("that man")
pp = embed(phrase_emb, words[1], words[2])
resynth(phrase_emb, pp, 26)

26x4 Array{Any,2}:
 "that"                  0.74191   "honeymooners"  0.591283
 "much-needed"           0.599668  "headmaster"    0.587335
 "because"               0.586175  "graduation"    0.586953
 "disbursements"         0.562552  "procession"    0.576014
 "cracked"               0.553832  "Ford-Carter"   0.573492
 "what"                  0.551342  "protectorate"  0.566705
 "Africa"                0.539998  "MI-8"          0.558809
 "sick"                  0.533608  "frugality"     0.55845 
 "disparate"             0.52187   "graveyard"     0.558278
 "parity"                0.517798  "fever"         0.554342
 "financially-troubled"  0.515416  "congressman"   0.553383
 "steps"                 0.51237   "dialect"       0.547689
 "Kaztelekom"            0.503821  "footballer"    0.547099
 "concessionary"         0.503011  "prodigy"       0.543899
 "jobless"               0.502015  "detachment"    0.540469
 "no"                    0.497899  "tipple"        0.538872
 "Third"             