In [1]:
using WordEmbeddings, SoftmaxClassifier
using ProgressMeter
using Utils
using Query
using WordStreams
using Distances
using StatsBase
using Training
using AdaGramCompat
using CorpusLoaders
using SwiftObjectStores
using JLD

using SenseAlignment

In [3]:
SwiftObjectStores.list(SwiftService(), "sensemodels")

Dict{Pair{Any,Any},Pair{Any,Any}} with 1 entry:
  Pair{Any,Any}("action","list_container_part") => Pair{Any,Any}("marker","")

In [17]:

ee = load("models/ss/tokenised_lowercase_WestburyLab.wikicorp.201004_300_i1.jld","ee");
#eep = get_jld(SwiftService(), "sensemodels/plain/", "tokenised_lowercase_WestburyLab.wikicorp.201004_100_nosubsample.jld", "ee");
#ee = get_jld(SwiftService(), "sensemodels", "plain/tokenised_lowercase_WestburyLab.wikicorp.201004_100_nosubsample.semhuff.jld", "ee");

In [4]:
using AdaGram
using AdaGramCompat


In [None]:
am = load("models/adagram/more_senses.adagram_model.jld", "am");

In [None]:
;source "~/openrc.sh"

In [5]:
s_am = get_jld(SwiftService(), "sensemodels", "adagram/semhuff_more_senses.adagram_model.jld", "am");

In [None]:
function get_sims(ee, wordpairs::Matrix{String})
    sims = Vector{Float64}(size(wordpairs,1))
    for ii in 1:size(wordpairs,1)
        try
            wvs_1::Matrix{Float32} = hcat(all_word_sense_vectors(ee,wordpairs[ii,1],"")...)
            wvs_2::Matrix{Float32} = hcat(all_word_sense_vectors(ee,wordpairs[ii,2],"")...)
            sims[ii]=1-minimum(pairwise(CosineDist(), wvs_1,wvs_2))
        catch ex
            warn(ex)
            sims[ii]=NaN
        end
    end
    return sims
end


In [None]:
wordpairs, groundsim = load_wordsim353("./data/corpora/wordsim353/combined.csv");
wordpairs=lowercase.(wordpairs)

@show corspearman(groundsim,get_sims(ee,wordpairs))
@show corspearman(groundsim,get_sims(am,wordpairs))
@show corspearman(groundsim,get_sims(s_am,wordpairs));

 - corspearman(groundsim,get_sims(ee,wordpairs)) = 0.5937382029721738, plain/tokenised_lowercase_WestburyLab.wikicorp.201004_100_nosubsample.semhuff.jld




In [7]:
function window_context(index::Integer, context::Vector, window_size::Integer)
    window_lower_bound = max(index - window_size÷2, 1)
    window_upper_bound = min(index + window_size÷2 -1 , length(context))
    view(context, window_lower_bound:window_upper_bound)
end
Base.Test.@test window_context(5, split("1 2 3 4 6 7 8"), 2) == ["4", "6"]
Base.Test.@test window_context(5, split("1 2 3 4 6 7 8"), 4) == ["3" , "4", "6", "7"]
Base.Test.@test window_context(2, split("1 3 4 5"), 4) == ["1","3","4"]
Base.Test.@test window_context(4, split("1 2 3 5"), 4) == ["2","3","5"]
Base.Test.@test window_context(5, split("1 2 3 4 6 7 8"), typemax(Int)) == split("1 2 3 4 6 7 8")

Test Passed
  Expression: window_context(5,split("1 2 3 4 6 7 8"),typemax(Int)) == split("1 2 3 4 6 7 8")
   Evaluated: SubString{String}["1","2","3","4","6","7","8"] == SubString{String}["1","2","3","4","6","7","8"]

In [8]:
(scws_wordpairs, scws_groundsim, scws_contexts, scws_indexes) = CorpusLoaders.load_scws("./data/corpora/SCWS/ratings.txt");

scws_lc_contexts = map(x->lowercase.(x), scws_contexts)

scws_windowed_contexts = ((iis, cont) -> window_context(iis, cont, 10)).(scws_indexes, scws_contexts);
scws_lc_windowed_contexts = map(x->lowercase.(x), scws_windowed_contexts);

scws_lc_wordpairs = lowercase.(scws_wordpairs);


In [None]:
function get_sims(ee, wordpairs::Matrix{String}, contexts::Matrix)
    sims = Vector{Float64}(size(wordpairs,1))
    for ii in 1:size(wordpairs,1)
        try 
            sense1 = WSD(ee, wordpairs[ii,1],contexts[ii,1];skip_oov=true)
            wv1 = ee.embedding[wordpairs[ii,1]][sense1]
            
            sense2 = WSD(ee, wordpairs[ii,2],contexts[ii,2];skip_oov=true)
            wv2 = ee.embedding[wordpairs[ii,2]][sense2]
            
            sims[ii]=1-cosine_dist(wv1,wv2)
        catch ex
            if typeof(ex)==KeyError
                warn(ex)
                sims[ii]=NaN
            else
                rethrow(ex)
            end
        end
    end
    return sims
end


In [None]:

#local_sims = get_sims(ee, scws_wordpairs, scws_contexts)
#corspearman(scws_groundsim, scws_local_sims)


global_syms = get_sims(am, scws_wordpairs)
corspearman(scws_groundsim, global_syms)


In [None]:
global_syms = get_sims(ee, scws_wordpairs)
corspearman(scws_groundsim, global_syms)

In [None]:
reload("SenseAlignment")

In [9]:
function get_avgsimc(ee, wordpairs::Matrix{String}, contexts::Matrix, normalise_over_context_lengths::Bool=true)
    sims = Vector{Float64}(size(wordpairs,1))
    npairs = size(wordpairs,1)

    @showprogress for ii in 1:npairs
        try 
            wvs1 = all_word_sense_vectors(ee,wordpairs[ii,1],"")
            wvs2 = all_word_sense_vectors(ee,wordpairs[ii,2],"")
            probs1 = general_wsd(ee, contexts[ii,1], wvs1;
                    normalise_over_context_length=normalise_over_context_lengths)
            probs2 = general_wsd(ee, contexts[ii,2], wvs2;
                    normalise_over_context_length=normalise_over_context_lengths)
            
            lprobs1 = log.(probs1)
            lprobs2 = log.(probs2)
            loffset = max(maximum(lprobs1),maximum(lprobs2))#Offset for stability
            @assert(isfinite(loffset), loffset)
            @assert(loffset<=0.0, loffset)
            
            score::Float64 = 0.0
            for (wv1,lprob1) in zip(wvs1, probs1)
                for (wv2,lprob2) in zip(wvs2, probs2)
                    d=1-cosine_dist(wv1,wv2)
                    lweight = lprob1+lprob2-loffset 
                    score+=exp(lweight)*d
                end
            end
            sims[ii] = (score*exp(loffset))/(length(wvs1)*length(wvs2))
        catch ex
            if typeof(ex) ∈ (KeyError, Query.NoContextError)   
                println(ex)
                sims[ii]=NaN
            else
                rethrow(ex)
            end
        end
    end
    return sims
end

get_avgsimc (generic function with 2 methods)

In [10]:
function get_sims_synth(ee, wordpairs::Matrix{String}, contexts::Matrix, normalise_over_context_lengths::Bool=true)
    sims = Vector{Float64}(size(wordpairs,1))
    npairs = size(wordpairs,1)

    @showprogress for ii in 1:npairs
        try 
            wv1::Vector{Float32} = SenseAlignment.synthesize_embedding(
                    ee, contexts[ii,1],wordpairs[ii,1],"",
                    normalise_over_context_lengths)
            
            wv2::Vector{Float32} = SenseAlignment.synthesize_embedding(
                    ee, contexts[ii,2],wordpairs[ii,2],"",
                    normalise_over_context_lengths)
             
            sims[ii]=1-cosine_dist(wv1,wv2)
        catch ex
            if typeof(ex) ∈ (KeyError, Query.NoContextError)   
                println(ex)
                sims[ii]=NaN
            else
                rethrow(ex)
            end
        end
    end
    return sims
end

get_sims_synth (generic function with 2 methods)

In [19]:
am=ee;

In [20]:
corspearman(scws_groundsim, 
get_sims_synth(am, scws_lc_wordpairs, scws_lc_windowed_contexts, false))

Progress:   0%|                                         |  ETA: 0:27:03KeyError(" SubString{String}[\"aglow\"], nor SubString{String}[\"\"] have embeddings")
Progress:   7%|███                                      |  ETA: 0:00:16KeyError(" SubString{String}[\"insufflate\"], nor SubString{String}[\"\"] have embeddings")
Progress:  23%|█████████                                |  ETA: 0:00:07KeyError(" SubString{String}[\"anorgasmia\"], nor SubString{String}[\"\"] have embeddings")
Progress:  27%|███████████                              |  ETA: 0:00:06KeyError(" SubString{String}[\"backdate\"], nor SubString{String}[\"\"] have embeddings")
Progress:  29%|████████████                             |  ETA: 0:00:06KeyError(" SubString{String}[\"notarize\"], nor SubString{String}[\"\"] have embeddings")
Progress:  42%|█████████████████                        |  ETA: 0:00:04KeyError(" SubString{String}[\"unblock\"], nor SubString{String}[\"\"] have embeddings")
Progress:  47%|███████████████████

0.40675642546098734

In [21]:
avgsimc_am_lc_windows_nosmooth = 
    get_avgsimc(am, scws_lc_wordpairs, scws_lc_windowed_contexts, false)
corspearman(scws_groundsim, avgsimc_am_lc_windows_nosmooth)

KeyError(" SubString{String}[\"aglow\"], nor SubString{String}[\"\"] have embeddings")
Progress:   7%|███                                      |  ETA: 0:00:06KeyError(" SubString{String}[\"insufflate\"], nor SubString{String}[\"\"] have embeddings")
Progress:  23%|█████████                                |  ETA: 0:00:04KeyError(" SubString{String}[\"anorgasmia\"], nor SubString{String}[\"\"] have embeddings")
Progress:  27%|███████████                              |  ETA: 0:00:04KeyError(" SubString{String}[\"backdate\"], nor SubString{String}[\"\"] have embeddings")
Progress:  29%|████████████                             |  ETA: 0:00:04KeyError(" SubString{String}[\"notarize\"], nor SubString{String}[\"\"] have embeddings")
Progress:  41%|█████████████████                        |  ETA: 0:00:03KeyError(" SubString{String}[\"unblock\"], nor SubString{String}[\"\"] have embeddings")
Progress:  48%|████████████████████                     |  ETA: 0:00:03KeyError(" SubString{String}[\"esc

0.28686105472877327

In [26]:
am.dimension

300

In [23]:
avgsimc_am_lc_windows_geosmooth = get_avgsimc(am, scws_lc_wordpairs, scws_lc_windowed_contexts, true)
corspearman(scws_groundsim, avgsimc_am_lc_windows_geosmooth)

KeyError(" SubString{String}[\"aglow\"], nor SubString{String}[\"\"] have embeddings")
Progress:   6%|██                                       |  ETA: 0:00:07KeyError(" SubString{String}[\"insufflate\"], nor SubString{String}[\"\"] have embeddings")
Progress:  23%|█████████                                |  ETA: 0:00:05KeyError(" SubString{String}[\"anorgasmia\"], nor SubString{String}[\"\"] have embeddings")
Progress:  27%|███████████                              |  ETA: 0:00:04KeyError(" SubString{String}[\"backdate\"], nor SubString{String}[\"\"] have embeddings")
Progress:  29%|████████████                             |  ETA: 0:00:04KeyError(" SubString{String}[\"notarize\"], nor SubString{String}[\"\"] have embeddings")
Progress:  41%|█████████████████                        |  ETA: 0:00:04KeyError(" SubString{String}[\"unblock\"], nor SubString{String}[\"\"] have embeddings")
Progress:  48%|████████████████████                     |  ETA: 0:00:04KeyError(" SubString{String}[\"esc

0.27588873113073076

In [24]:
corspearman(scws_groundsim, 
get_sims_synth(am, scws_lc_wordpairs, scws_lc_windowed_contexts, true))
#0.650271346861422 Adagram

KeyError(" SubString{String}[\"aglow\"], nor SubString{String}[\"\"] have embeddings")
Progress:   6%|██                                       |  ETA: 0:00:05KeyError(" SubString{String}[\"insufflate\"], nor SubString{String}[\"\"] have embeddings")
Progress:  22%|█████████                                |  ETA: 0:00:04KeyError(" SubString{String}[\"anorgasmia\"], nor SubString{String}[\"\"] have embeddings")
Progress:  27%|███████████                              |  ETA: 0:00:03KeyError(" SubString{String}[\"backdate\"], nor SubString{String}[\"\"] have embeddings")
Progress:  28%|████████████                             |  ETA: 0:00:03KeyError(" SubString{String}[\"notarize\"], nor SubString{String}[\"\"] have embeddings")
Progress:  41%|█████████████████                        |  ETA: 0:00:03KeyError(" SubString{String}[\"unblock\"], nor SubString{String}[\"\"] have embeddings")
Progress:  46%|███████████████████                      |  ETA: 0:00:05KeyError(" SubString{String}[\"esc

0.4970380445510214

In [None]:
corspearman(scws_groundsim, 
get_avgsimc(am, scws_lc_wordpairs, scws_lc_contexts, true))

In [None]:
corspearman(scws_groundsim, 
get_avgsimc(am, scws_lc_wordpairs, scws_lc_contexts, false))
#0.42377281027185104

In [None]:
corspearman(scws_groundsim, 
get_sims_synth(am, scws_lc_wordpairs, scws_lc_contexts, true))
#0.6452188028304504

Progress:   8%|███                                      |  ETA: 0:00:29KeyError(" SubString{String}[\"insufflate\"], nor SubString{String}[\"\"] have embeddings")
Progress:  27%|███████████                              |  ETA: 0:00:22KeyError(" SubString{String}[\"backdate\"], nor SubString{String}[\"\"] have embeddings")
Progress: 100%|█████████████████████████████████████████| Time: 0:00:31


0.5246294815842275

In [None]:
# 0.5342692139056353     GlobalMin   plain/tokenised_lowercase_WestburyLab.wikicorp.201004_100_nosubsample.jld
# 0.48861817784625977    GlobalMin   adagram/more_senses.adagram_model.jld
# 0.45416228327202474    GlobalMin   sensemodels/adagram/semhuff_more_senses.adagram_model.jld"
# 0.523055629491102      GlobalMin   sensemodels/adagram/semhuff_more_senses.adagram_model.jld"

# 0.524743130161508      LocalSynth   greedy/tokenised_lowercase_WestburyLab.wikicorp.201004_300_i1.jl
# 0.5104268665100573     LocalSynth   sensemodels/adagram/semhuff_more_senses.adagram_model.jld"
# 0.6288843132192203     LocalSynth   adagram/more_senses.adagram_model.jld