In [5]:
using WordEmbeddings, SoftmaxClassifier
using ProgressMeter
using Utils
using Query
using WordStreams
using Distances
using StatsBase
using Training
using AdaGramCompat
using CorpusLoaders.Similarity

In [6]:
SwiftObjectStores.list(SwiftService(), "sensemodels")

Dict{Pair{Any,Any},Pair{Any,Any}} with 1 entry:
  Pair{Any,Any}("action","list_container_part") => Pair{Any,Any}("marker","")

In [7]:
function all_word_sense_vectors(ee::WordEmbedding, word)
    if haskey(ee.embedding, word)
        [ee.embedding[word]]
    else
        Vector{Float32}[]
    end
end


function all_word_sense_vectors(ee::WordSenseEmbedding, word)
    get(ee.embedding, word, Vector{Float32}[])
end

function all_word_sense_vectors(am::AdaGramCompat.AdaGramModel, word)
    if haskey(am.dict.word2id, word)
        wsv_mat = word_sense_vectors(am, word)
        [view(wsv_mat,:,ii) for ii in 1:size(wsv_mat,2)]
    else
        Vector{Float32}[]
    end
end

all_word_sense_vectors (generic function with 3 methods)

In [8]:
using JLD
#ee = load("models/ss/tokenised_lowercase_WestburyLab.wikicorp.201004_300_i1.jld","ee");
#eep = get_jld(SwiftService(), "sensemodels/plain/", "tokenised_lowercase_WestburyLab.wikicorp.201004_100_nosubsample.jld", "ee");
ee = get_jld(SwiftService(), "sensemodels", "plain/tokenised_lowercase_WestburyLab.wikicorp.201004_100_nosubsample.semhuff.jld", "ee");

In [None]:
using AdaGram
using AdaGramCompat
am = load("models/adagram/more_senses.adagram_model.jld", "am");

In [None]:
s_am = get_jld(SwiftService(), "sensemodels/adagram/", "semhuff_more_senses.adagram_model.jld", "am");

In [11]:
function get_sims(ee, wordpairs::Matrix{String})
    sims = Vector{Float64}(size(wordpairs,1))
    for ii in 1:size(wordpairs,1)
        try
            wvs_1::Matrix{Float32} = hcat(all_word_sense_vectors(ee,wordpairs[ii,1])...)
            wvs_2::Matrix{Float32} = hcat(all_word_sense_vectors(ee,wordpairs[ii,2])...)
            sims[ii]=1-minimum(pairwise(CosineDist(), wvs_1,wvs_2))
        catch ex
            warn(ex)
            sims[ii]=NaN
        end
    end
    return sims
end


get_sims (generic function with 1 method)

In [12]:
wordpairs, groundsim = load_wordsim353("./data/corpora/wordsim353/combined.csv");
wordpairs=lowercase.(wordpairs)

@show corspearman(groundsim,get_sims(ee,wordpairs))
@show corspearman(groundsim,get_sims(am,wordpairs))
@show corspearman(groundsim,get_sims(s_am,wordpairs));

corspearman(groundsim,get_sims(ee,wordpairs)) = 0.5937382029721738


LoadError: LoadError: UndefVarError: am not defined
while loading In[12], in expression starting on line 218

 - corspearman(groundsim,get_sims(ee,wordpairs)) = 0.5937382029721738, plain/tokenised_lowercase_WestburyLab.wikicorp.201004_100_nosubsample.semhuff.jld




In [13]:
function window_context(index::Integer, context::Vector, window_size::Integer)
    window_lower_bound = max(index - window_size÷2, 1)
    window_upper_bound = min(index + window_size÷2 -1 , length(context))
    view(context, window_lower_bound:window_upper_bound)
end
Base.Test.@test window_context(5, split("1 2 3 4 6 7 8"), 2) == ["4", "6"]
Base.Test.@test window_context(5, split("1 2 3 4 6 7 8"), 4) == ["3" , "4", "6", "7"]
Base.Test.@test window_context(2, split("1 3 4 5"), 4) == ["1","3","4"]
Base.Test.@test window_context(4, split("1 2 3 5"), 4) == ["2","3","5"]
Base.Test.@test window_context(5, split("1 2 3 4 6 7 8"), typemax(Int)) == split("1 2 3 4 6 7 8")

Test Passed
  Expression: window_context(5,split("1 2 3 4 6 7 8"),typemax(Int)) == split("1 2 3 4 6 7 8")
   Evaluated: SubString{String}["1","2","3","4","6","7","8"] == SubString{String}["1","2","3","4","6","7","8"]

In [14]:
(scws_wordpairs, scws_groundsim, scws_contexts, scws_indexes) = CorpusLoaders.Similarity.load_scws("./data/corpora/SCWS/ratings.txt");
scws_windowed_contexts = ((iis, cont) -> window_context(iis, cont, 5)).(scws_indexes, scws_contexts);

In [15]:
function get_sims(ee, wordpairs::Matrix{String}, contexts::Matrix)
    sims = Vector{Float64}(size(wordpairs,1))
    for ii in 1:size(wordpairs,1)
        try 
            sense1 = WSD(ee, wordpairs[ii,1],contexts[ii,1];skip_oov=true)
            wv1 = ee.embedding[wordpairs[ii,1]][sense1]
            
            sense2 = WSD(ee, wordpairs[ii,2],contexts[ii,2];skip_oov=true)
            wv2 = ee.embedding[wordpairs[ii,2]][sense2]
            
            sims[ii]=1-cosine_dist(wv1,wv2)
        catch ex
            if typeof(ex)==KeyError
                warn(ex)
                sims[ii]=NaN
            else
                rethrow(ex)
            end
        end
    end
    return sims
end


get_sims (generic function with 2 methods)

In [16]:

local_sims = get_sims(ee, scws_wordpairs, scws_contexts)
corspearman(scws_groundsim, scws_local_sims)


global_syms = get_sims(am, scws_wordpairs)
corspearman(scws_groundsim, global_syms)


LoadError: LoadError: MethodError: no method matching get_sims(::WordEmbeddings.WordEmbedding, ::Array{String,2}, ::Array{Array{SubString,1},2})
Closest candidates are:
  get_sims(!Matched::WordEmbeddings.FixedWordSenseEmbedding, ::Array{String,2}, ::Array{T,2}) at In[15]:2
  get_sims(::Any, ::Array{String,2}) at In[11]:2
while loading In[16], in expression starting on line 2

In [17]:
global_syms = get_sims(ee, scws_wordpairs)
corspearman(scws_groundsim, global_syms)

2016-09-23T20:50:29.579 - warn: MethodError: Cannot `convert` an object of type Array{Any,1} to an object of type Array{Float32,2}
This may have arisen from a call to the constructor Array{Float32,2}(...),
since type constructors fall back to convert methods. prefix: "error: "
2016-09-23T20:50:30.524 - warn: MethodError: Cannot `convert` an object of type Array{Any,1} to an object of type Array{Float32,2}
This may have arisen from a call to the constructor Array{Float32,2}(...),
since type constructors fall back to convert methods. prefix: "error: "
2016-09-23T20:50:30.527 - warn: MethodError: Cannot `convert` an object of type Array{Any,1} to an object of type Array{Float32,2}
This may have arisen from a call to the constructor Array{Float32,2}(...),
since type constructors fall back to convert methods. prefix: "error: "
2016-09-23T20:50:30.53 - warn: MethodError: Cannot `convert` an object of type Array{Any,1} to an object of type Array{Float32,2}
This may have arisen from a call to 

0.5620437646722409

In [None]:
function synthesize_embedding(am::AdaGramModel,context::AbstractVector, word::AbstractString)
    known_context = filter(c->haskey(am.dict.word2id, c), context)
    sum(all_word_sense_vectors(am, word).*disambiguate(am.vm, am.dict, word, known_context, false))
end

In [None]:
_lp=Dict()

In [None]:
parse("""
foo(x)=4
""")

In [None]:
function normal_probs(logprobs::Vector)
    ret = copy(logprobs)
    max_lp = maximum(logprobs)
    ret.-=max_lp #Bring closer to zero
    map!(exp,ret)
    denom = sum(ret)
    ret./=denom
    ret
end


function synthesize_embedding2(ee,context::AbstractVector, word_or_phrase::AbstractString, smoothby=1)
    logprobs, wvs = get!(_lp, (ee, context, word_or_phrase)) do
        words = split(word_or_phrase, " ")
        wvs = vcat((all_word_sense_vectors(ee,w) for w in words)...)
        if length(wvs) == 0
                throw(KeyError("None of $words have embeddings"))
        end
        lps = [logprob_of_context(ee, context, wv; skip_oov=true, normalise_over_length=false) 
                 for wv in wvs]
        (lps, wvs)
    end
    slogprobs = logprobs./(smoothby*length(context))
    sum(normal_probs(slogprobs).*wvs)
end
    
    

"""
synthesize_embedding:: Embeddings, Context, Word -> WordVector
"""
    function get_sims_synth(ee, wordpairs::Matrix{String}, contexts::Matrix, smoothby=1)
    sims = Vector{Float64}(size(wordpairs,1))
    npairs = size(wordpairs,1)
        #@showprogress 
        for ii in 1:npairs
        try           
            wv1 = synthesize_embedding2(ee, contexts[ii,1],wordpairs[ii,1], smoothby)
            wv2 = synthesize_embedding2(ee, contexts[ii,2],wordpairs[ii,2], smoothby)
             
            sims[ii]=1-cosine_dist(wv1,wv2)
        catch ex
            if typeof(ex)==KeyError
                #warn(ex)
                sims[ii]=NaN
            else
                rethrow(ex)
            end
        end
    end
    return sims
end


In [None]:
sims = get_sims_synth(am, scws_wordpairs, scws_contexts, 0.5)
corspearman(scws_groundsim, sims)

In [None]:
corspearman(scws_groundsim, get_sims_synth(am, scws_wordpairs, windowed_scws_contexts,1))

In [None]:
corspearman(scws_groundsim, get_sims_synth(s_am, scws_wordpairs, scws_contexts, 1))

In [None]:
corspearman(scws_groundsim, get_sims_synth(s_am, scws_wordpairs, windowed_scws_contexts,1))

In [None]:
corspearman(scws_groundsim, get_sims_synth(s_am, scws_wordpairs, windowed_scws_contexts,1))

In [None]:
cor = corspearman(scws_groundsim, get_sims_synth(am, scws_wordpairs,scws_contexts, 2/e) )

In [None]:
cor = corspearman(scws_groundsim, get_sims_synth(am, scws_wordpairs,scws_contexts, 0.74))

In [None]:
r=[]
for ii in 0.1:0.5:100
    cor = corspearman(scws_groundsim, get_sims_synth(s_am, scws_wordpairs,scws_contexts, ii))
    push!(r,(ii,cor))
    println(ii,"\t", cor)
    flush(STDOUT)
end

In [None]:
using UnicodePlots

In [None]:
lineplot(first.(r), (x->x[2]).(r))



In [None]:
lineplot(first.(r), (x->x[2]).(r))

In [None]:
@show i=findmax((x->x[2]).(r))
r[i[2]]

In [None]:
# 0.5342692139056353     GlobalMin   plain/tokenised_lowercase_WestburyLab.wikicorp.201004_100_nosubsample.jld
# 0.48861817784625977    GlobalMin   adagram/more_senses.adagram_model.jld
# 0.45416228327202474    GlobalMin   sensemodels/adagram/semhuff_more_senses.adagram_model.jld"
# 0.523055629491102      GlobalMin   sensemodels/adagram/semhuff_more_senses.adagram_model.jld"

# 0.524743130161508      LocalSynth   greedy/tokenised_lowercase_WestburyLab.wikicorp.201004_300_i1.jl
# 0.5104268665100573     LocalSynth   sensemodels/adagram/semhuff_more_senses.adagram_model.jld"
# 0.6288843132192203     LocalSynth   adagram/more_senses.adagram_model.jld