In [94]:
include("load_embeddings.jl")

load_embeddings (generic function with 1 method)

In [3]:
# Bundles the word embeddings, the vocabulary lookups and the combination
# matrices used to build phrase embeddings.
type PhraseEmbedder
    # Word embedding matrix
    L::Matrix{Float64}
    # Maps each word to its one-hot representation
    e::Dict{String,BitVector}
    # Maps an index back to its word
    indexed_words::Vector{String}
    # Word combination matrix
    W::Matrix{Float64}
    # Pseudo-inverse of the word combination matrix
    iW::Matrix{Float64}
end

# Convenience constructor. Note the argument order: the combination matrix WW comes
# first here, although it is the fourth field of the type. The pseudo-inverse field
# is filled in with pinv(WW) so callers do not have to supply it.
PhraseEmbedder(WW, LL, ee, indexed_words) =
    PhraseEmbedder(LL, ee, indexed_words, WW, pinv(WW))

PhraseEmbedder (constructor with 3 methods)

In [4]:
# Load the embeddings file. The three results are later handed to PhraseEmbedder as
# its embedding matrix (LL), word -> one-hot lookup (ee) and index -> word list.
LL,ee,indexed_words =  load_embeddings("embeddings-scaled.EMBEDDING_SIZE=50.txt");


(50,268810)

In [66]:
# Random initial combination matrix: with d = size(LL,1) it has d rows and 2d columns,
# so it maps a stacked pair of embeddings back down to a single embedding-sized vector.
# Entries are standard-normal draws scaled by 0.1.
WW = randn(size(LL, 1), 2 * size(LL, 1)) * 0.1

phrase_emb = PhraseEmbedder(WW, LL, ee, indexed_words);

# Echo the dimensions of the embedding matrix.
size(LL)

(50,268810)

In [67]:
# Embed the two-word phrase (word1, word2) as a single vector.
#
# Each word's embedding is obtained by multiplying the embedding matrix pe.L by the
# word's one-hot vector pe.e[word]. The two embeddings are stacked, multiplied by the
# combination matrix pe.W, and squashed elementwise with tanh.
#
# Looking up a word that is not a key of pe.e raises a KeyError.
function embed(pe::PhraseEmbedder, word1, word2)
    c1 = pe.L * pe.e[word1]
    c2 = pe.L * pe.e[word2]
    # `;` always concatenates vertically. `[c1, c2]` only does so in old Julia
    # versions; elsewhere it builds a two-element vector of vectors, which pe.W
    # cannot be multiplied with.
    c1c2 = [c1; c2]
    # map applies tanh per element without relying on tanh accepting a whole array.
    return map(tanh, pe.W * c1c2)
end

embed (generic function with 1 method)

In [68]:
# Cosine of the angle between a and b: their dot product divided by the product of
# their norms.
# NOTE: despite the "dist" in the name this is a similarity, not a distance — larger
# values mean the vectors point in more similar directions.
function cosine_dist(a, b)
    scale = norm(a) * norm(b)
    return dot(a, b) / scale
end

# Score the query vector cc against every column of globe using cosine_dist.
# Entry ii of the result is the score of cc against column ii of globe.
function neighbour_dists(cc::Vector{Float64}, globe::Matrix{Float64})
    ncols = size(globe, 2)
    return map(col -> cosine_dist(cc, globe[:, col]), 1:ncols)
end


neighbour_dists (generic function with 1 method)

In [81]:
# Build a table of the highest-scoring vocabulary entries.
#
# candidates[ii] is treated as the score of the word pe.indexed_words[ii]. The result
# has one row per listed word, ordered from highest score to lowest, with the word in
# column 1 and its score in column 2. At most nbest rows are returned; if there are
# fewer candidates than nbest, all of them are listed.
function show_best(pe::PhraseEmbedder, candidates, nbest=20)
    # Clamp so that asking for more rows than there are candidates does not throw.
    nshown = min(nbest, length(candidates))
    # Rank the indices themselves. Looking each top score back up by value would
    # return the same index for tied scores (listing one word twice) and would
    # rescan the whole score vector once per row.
    ranked = sortperm(candidates, rev=true)
    rows = [[pe.indexed_words[ii] candidates[ii]] for ii in ranked[1:nshown]]
    return vcat(rows...)
end

# Approximately recover the two words behind the phrase embedding p12.
#
# Undoes the tanh elementwise with atanh, applies the pseudo-inverse pe.iW to get an
# estimate of the stacked pair of word embeddings, splits that estimate in half, and
# scores every column of pe.L against each half.
#
# Returns show_best's table for the first word placed beside the table for the second
# word: up to nbest rows, columns word / score / word / score.
function resynth(pe::PhraseEmbedder, p12, nbest=5)
    # Decode once and slice the result, rather than repeating the matrix product
    # for each half.
    ĉ1ĉ2 = pe.iW * map(atanh, p12)
    # Split at the midpoint instead of the fixed ranges 1:50 and 51:100, so the
    # function is not tied to 50-dimensional embeddings. For a 100-element estimate
    # this selects exactly those ranges.
    half = div(length(ĉ1ĉ2), 2)
    ĉ1 = ĉ1ĉ2[1:half]
    ĉ2 = ĉ1ĉ2[half+1:end]
    ê1_candidates = neighbour_dists(ĉ1, pe.L)
    ê2_candidates = neighbour_dists(ĉ2, pe.L)
    return [show_best(pe, ê1_candidates, nbest) show_best(pe, ê2_candidates, nbest)]
end

resynth (generic function with 2 methods)

In [82]:
# Round-trip demo: embed the phrase "foolish man", then list the 26 best-scoring
# vocabulary candidates for each of its two positions.
words = split("foolish man")
pp = embed(phrase_emb, words[1], words[2])
resynth(phrase_emb, pp, 26)

26x4 Array{Any,2}:
 "evaporating"       0.661644  "man"          0.730985
 "mistyped"          0.65391   "insistence"   0.671059
 "impinging"         0.640174  "referees"     0.658096
 "mutated"           0.636294  "artist"       0.648675
 "meaningless."      0.635216  "aggression"   0.64698 
 "interacting"       0.61948   "species"      0.645078
 "bending"           0.612059  "Pope"         0.64331 
 "thumbing"          0.610788  "youth"        0.643148
 "bunching"          0.60364   "referee"      0.632438
 "stockpicking"      0.603497  "driver"       0.62801 
 "well-aimed"        0.602395  "Democrat"     0.623901
 "sleeplessness"     0.598432  "weapon"       0.623435
 "converging"        0.5977    "flag"         0.619023
 "non-conforming"    0.59676   "gunboat"      0.618838
 "well-capitalised"  0.59649   "defences"     0.617618
 "underrated"        0.594866  "kids"         0.616917
 "challenging."      0.592742  "orientation"  0.615816
 "respectful"        0.590245  "settler"      

In [71]:
# Embed a three-word phrase with the grouping ((word1 word2) word3).
#
# The first two words are combined with embed; that phrase vector is then stacked on
# top of word3's embedding (pe.L times its one-hot vector pe.e[word3]), multiplied by
# the combination matrix pe.W and squashed elementwise with tanh.
#
# Looking up a word that is not a key of pe.e raises a KeyError.
function embed_12_3(pe::PhraseEmbedder, word1, word2, word3)
    p12 = embed(pe, word1, word2)
    c3 = pe.L * pe.e[word3]
    return map(tanh, pe.W * [p12; c3])
end

embed_12_3 (generic function with 1 method)

In [92]:
# Inputs for the three-word experiment; `pe` is a short alias for phrase_emb.
pe = phrase_emb
word1 = "a"
word2 = "an"
word3 = "the"




"the"

In [93]:
# Three-word round trip with the grouping ((word1 word2) word3).

# Undo one composition step: invert the tanh elementwise, apply the pseudo-inverse
# pe.iW, and return the two halves of the result. Decoding once and splitting at the
# midpoint replaces computing the same product twice and slicing with the fixed
# ranges 1:50 / 51:100 (which the midpoint split reproduces for a 100-element result).
function decode_pair(pe::PhraseEmbedder, p)
    ĉ = pe.iW * map(atanh, p)
    half = div(length(ĉ), 2)
    return ĉ[1:half], ĉ[half+1:end]
end

# Forward pass: combine the first two words, then combine that phrase with word3.
p12 = embed(pe, word1, word2)
c3 = pe.L*pe.e[word3]
p12_3 = map(tanh, pe.W*[p12; c3])

# Backward pass, outer step: estimates of the (word1 word2) phrase vector and of word3.
ĉ12, ĉ3 = decode_pair(pe, p12_3)

# Backward pass, inner step: estimates of word1 and word2 from the estimated phrase.
# NOTE(review): ĉ12 is only an estimate, so nothing here keeps its entries inside
# (-1, 1), the range on which atanh is real-valued — confirm this is acceptable.
ĉ1, ĉ2 = decode_pair(pe, ĉ12)

ê1_candidates = neighbour_dists(ĉ1, pe.L)
ê2_candidates = neighbour_dists(ĉ2, pe.L)
ê3_candidates = neighbour_dists(ĉ3, pe.L)

[show_best(pe, ê1_candidates) show_best(pe, ê2_candidates) show_best(pe, ê3_candidates)]

20x6 Array{Any,2}:
 "slided"          0.541224  "outlawed"         …  "the"             0.689449
 "hire-purchase"   0.525807  "ACT"                 "The"             0.598609
 "participating."  0.515988  "Moslem-majority"     "Basel"           0.574183
 "Emblaze"         0.513615  "continued."          "Uribe"           0.548154
 "unrivaled"       0.512558  "pro-British"         "Basque"          0.544355
 "melded"          0.512258  "strikes."         …  "Constitutional"  0.532194
 "Abbreviated"     0.507599  "loyalist"            "undefeated"      0.527619
 "unratified"      0.506303  "Favoured"            "next"            0.525303
 "Britrain"        0.505244  "Basque"              "another"         0.522995
 "unutilised"      0.494179  "Fatherland"          "shuttle"         0.510541
 "civilisational"  0.490582  "insists"          …  "at"              0.503717
 "antidilutive"    0.488746  "PLA"                 "Zulu"            0.500902
 "recapitalized"   0.487065  "Flemish"       

In [64]:
# Look up the built-in documentation for atanh, the function used to undo tanh when
# decoding phrase embeddings.
help(atanh)

Base.atanh(x)

   Compute the inverse hyperbolic tangent of "x"
