In [1]:
using WordEmbeddings, SoftmaxClassifier
using ProgressMeter
using Utils
using Query
using WordStreams
using Distances
using StatsBase
using Training
using AdaGramCompat
using CorpusLoaders
using SwiftObjectStores
using JLD

using SenseAlignment

In [None]:
# Ask the Swift object store what is available under "sensemodels"
# (presumably the stored pretrained models — confirm against SwiftObjectStores).
SwiftObjectStores.list(SwiftService(), "sensemodels")

In [None]:

# Load the embedding model `ee` from a local JLD file (stored under the key "ee").
ee = load("models/ss/tokenised_lowercase_WestburyLab.wikicorp.201004_300_i1.jld","ee");
# Disabled alternatives: fetch other model variants from the Swift object store instead.
#eep = get_jld(SwiftService(), "sensemodels/plain/", "tokenised_lowercase_WestburyLab.wikicorp.201004_100_nosubsample.jld", "ee");
#ee = get_jld(SwiftService(), "sensemodels", "plain/tokenised_lowercase_WestburyLab.wikicorp.201004_100_nosubsample.semhuff.jld", "ee");

In [2]:
using AdaGram
using AdaGramCompat


In [3]:
# Load the AdaGram model `am` from a local JLD file (stored under the key "am").
am = load("models/adagram/more_senses.adagram_model.jld", "am");

In [None]:
;source "~/openrc.sh"

In [None]:
# Number of entries in the model's id-to-word table.
am.dict.id2word |> length

In [None]:
# Fetch a second model from the "sensemodels" store (key "am"). The object name
# suggests a "semhuff" variant of the AdaGram model — TODO confirm what that denotes.
s_am = get_jld(SwiftService(), "sensemodels", "adagram/semhuff_more_senses.adagram_model.jld", "am");

In [None]:
"""
    get_sims(ee, wordpairs::Matrix{String})

Context-free similarity for every row of `wordpairs`.

For each row the sense vectors of both words are fetched with
`all_word_sense_vectors` and the score is `1 - minimum(cosine distance)` over
all sense pairs, i.e. the similarity of the closest pair of senses.

Returns a `Vector{Float64}` with one entry per row. A row whose computation
fails is reported through `warn` and scored `NaN`. An `InterruptException`
is never swallowed, so the loop can still be aborted with Ctrl-C.
"""
function get_sims(ee, wordpairs::Matrix{String})
    npairs = size(wordpairs, 1)
    sims = fill(NaN, npairs)
    for ii in 1:npairs
        try
            wvs_1::Matrix{Float32} = hcat(all_word_sense_vectors(ee, wordpairs[ii,1], "")...)
            wvs_2::Matrix{Float32} = hcat(all_word_sense_vectors(ee, wordpairs[ii,2], "")...)
            sims[ii] = 1 - minimum(pairwise(CosineDist(), wvs_1, wvs_2))
        catch ex
            # A blanket catch would turn a user interrupt into a warning and
            # keep looping; let interrupts through.
            isa(ex, InterruptException) && rethrow(ex)
            warn(ex)
            sims[ii] = NaN
        end
    end
    return sims
end


In [None]:
# WordSim-353 evaluation: load the word pairs and their reference similarities,
# then lowercase the words.
wordpairs, groundsim = load_wordsim353("./data/corpora/wordsim353/combined.csv");
wordpairs=lowercase.(wordpairs)

# Spearman rank correlation between the reference similarities and the
# context-free similarities produced by each of the three loaded models.
@show corspearman(groundsim,get_sims(ee,wordpairs))
@show corspearman(groundsim,get_sims(am,wordpairs))
@show corspearman(groundsim,get_sims(s_am,wordpairs));

 - corspearman(groundsim,get_sims(ee,wordpairs)) = 0.5937382029721738, plain/tokenised_lowercase_WestburyLab.wikicorp.201004_100_nosubsample.semhuff.jld




In [5]:
"""
    window_context(index::Integer, context::Vector, window_size::Integer)

Return a view of `context` around position `index`: `window_size ÷ 2` entries
before `index` and `window_size ÷ 2` entries starting at `index`, clipped to
the bounds of `context`. No copy is made.
"""
function window_context(index::Integer, context::Vector, window_size::Integer)
    half = div(window_size, 2)
    # Clamp both ends so windows near the edges simply come out shorter.
    first_pos = max(index - half, 1)
    last_pos = min(index + half - 1, length(context))
    return view(context, first_pos:last_pos)
end
# Sanity checks for window_context. The sample contexts leave out one token
# (e.g. "5" is absent from "1 2 3 4 6 7 8"), so `index` is the slot where that
# token would sit and the window straddles it.
Base.Test.@test window_context(5, split("1 2 3 4 6 7 8"), 2) == ["4", "6"]
Base.Test.@test window_context(5, split("1 2 3 4 6 7 8"), 4) == ["3" , "4", "6", "7"]
# Windows are clipped at either end of the context.
Base.Test.@test window_context(2, split("1 3 4 5"), 4) == ["1","3","4"]
Base.Test.@test window_context(4, split("1 2 3 5"), 4) == ["2","3","5"]
# A window larger than the context returns the whole context.
Base.Test.@test window_context(5, split("1 2 3 4 6 7 8"), typemax(Int)) == split("1 2 3 4 6 7 8")

Test Passed
  Expression: window_context(5,split("1 2 3 4 6 7 8"),typemax(Int)) == split("1 2 3 4 6 7 8")
   Evaluated: SubString{String}["1","2","3","4","6","7","8"] == SubString{String}["1","2","3","4","6","7","8"]

In [6]:
# Load the SCWS ratings file. Judging by the names, this yields the word pairs,
# the reference similarities, one context per word and the index of each word
# inside its context — confirm against CorpusLoaders.load_scws.
(scws_wordpairs, scws_groundsim, scws_contexts, scws_indexes) = CorpusLoaders.load_scws("./data/corpora/SCWS/ratings.txt");

# Lowercased copy of every full context.
scws_lc_contexts = map(x->lowercase.(x), scws_contexts)

# Cut each context to a window of size 10 around its index, then lowercase the windows.
scws_windowed_contexts = ((iis, cont) -> window_context(iis, cont, 10)).(scws_indexes, scws_contexts);
scws_lc_windowed_contexts = map(x->lowercase.(x), scws_windowed_contexts);

# Lowercased word pairs, to go with the lowercased contexts.
scws_lc_wordpairs = lowercase.(scws_wordpairs);


In [None]:
"""
    get_sims(ee, wordpairs::Matrix{String}, contexts::Matrix)

Context-aware similarity for every row of `wordpairs`.

For each word, `WSD` (called with `skip_oov=true`) selects one sense from the
matching entry of `contexts`, and that sense's vector is read from
`ee.embedding`. The score is `1 - cosine_dist` of the two selected vectors.

Returns a `Vector{Float64}` with one entry per row. A `KeyError` is reported
through `warn` and scored `NaN`; any other exception is rethrown.
"""
function get_sims(ee, wordpairs::Matrix{String}, contexts::Matrix)
    npairs = size(wordpairs, 1)
    sims = Vector{Float64}(npairs)

    # Vector of the sense that WSD selects for the word in column `col` of row `row`.
    function sense_vector(row, col)
        word = wordpairs[row, col]
        sense = WSD(ee, word, contexts[row, col]; skip_oov=true)
        return ee.embedding[word][sense]
    end

    for row in 1:npairs
        try
            first_vec = sense_vector(row, 1)
            second_vec = sense_vector(row, 2)
            sims[row] = 1 - cosine_dist(first_vec, second_vec)
        catch err
            # Only a failed lookup is tolerated; everything else is a real error.
            isa(err, KeyError) || rethrow(err)
            warn(err)
            sims[row] = NaN
        end
    end
    return sims
end


In [None]:

# Disabled: context-aware SCWS evaluation of `ee`.
# NOTE(review): the second line reads `scws_local_sims`, but the first assigns
# `local_sims` — reconcile the names before re-enabling.
#local_sims = get_sims(ee, scws_wordpairs, scws_contexts)
#corspearman(scws_groundsim, scws_local_sims)


# SCWS scored with the context-free similarity of the AdaGram model.
# NOTE(review): this uses `scws_wordpairs`, not the lowercased `scws_lc_wordpairs` — confirm intended.
global_syms = get_sims(am, scws_wordpairs)
corspearman(scws_groundsim, global_syms)


In [None]:
# Same context-free SCWS evaluation, this time for `ee` (overwrites `global_syms`).
global_syms = get_sims(ee, scws_wordpairs)
corspearman(scws_groundsim, global_syms)

In [None]:
# Re-load the SenseAlignment module so edits to its source are picked up in this session.
reload("SenseAlignment")

In [None]:
"""
    get_avgsimc(ee, wordpairs::Matrix{String}, contexts::Matrix,
                normalise_over_context_lengths::Bool=true)

Weighted average similarity over all sense pairs of each word pair.

For each row, every sense vector of the first word is compared with every
sense vector of the second. Each cosine similarity is weighted by the product
of the two values `general_wsd` returns for those senses given the matching
entries of `contexts`. The weighted sum is divided by the number of sense pairs.
`normalise_over_context_lengths` is forwarded to `general_wsd` as
`normalise_over_context_length`.

Returns a `Vector{Float64}` with one entry per row. Rows that raise `KeyError`
or `Query.NoContextError` are printed and scored `NaN`; other exceptions are
rethrown.
"""
function get_avgsimc(ee, wordpairs::Matrix{String}, contexts::Matrix, normalise_over_context_lengths::Bool=true)
    npairs = size(wordpairs,1)
    sims = Vector{Float64}(npairs)

    @showprogress for row in 1:npairs
        try
            senses_a = all_word_sense_vectors(ee, wordpairs[row,1], "")
            senses_b = all_word_sense_vectors(ee, wordpairs[row,2], "")
            weights_a = general_wsd(ee, contexts[row,1], senses_a;
                    normalise_over_context_length=normalise_over_context_lengths)
            weights_b = general_wsd(ee, contexts[row,2], senses_b;
                    normalise_over_context_length=normalise_over_context_lengths)

            logw_a = log.(weights_a)
            logw_b = log.(weights_b)
            # Work relative to the largest log-weight for numerical stability;
            # the shift is undone when the total is stored.
            shift = max(maximum(logw_a), maximum(logw_b))
            @assert(isfinite(shift), shift)
            @assert(shift<=0.0, shift)

            total::Float64 = 0.0
            for (vec_a, la) in zip(senses_a, logw_a), (vec_b, lb) in zip(senses_b, logw_b)
                similarity = 1 - cosine_dist(vec_a, vec_b)
                total += exp(la + lb - shift) * similarity
            end
            sims[row] = (total * exp(shift)) / (length(senses_a) * length(senses_b))
        catch err
            if typeof(err) in (KeyError, Query.NoContextError)
                println(err)
                sims[row] = NaN
            else
                rethrow(err)
            end
        end
    end
    return sims
end

In [17]:
"""
    get_sims_synth(ee, wordpairs::Matrix{String}, contexts::Matrix;
                   normalise_over_context_length=true,
                   normalize_over_prior=false,
                   use_prior=false)

Similarity of context-specific embeddings for every row of `wordpairs`.

For each word, `SenseAlignment.synthesize_embedding` builds one vector from the
matching entry of `contexts`; the three keyword options are forwarded to it
unchanged. The score is `1 - cosine_dist` of the two vectors. The value of
`use_prior` is echoed with `@show` at the start of the run.

Returns a `Vector{Float64}` with one entry per row. Rows that raise `KeyError`
or `Query.NoContextError` are printed and scored `NaN`; other exceptions are
rethrown.
"""
function get_sims_synth(ee, wordpairs::Matrix{String}, contexts::Matrix;
         normalise_over_context_length::Bool=true,
         normalize_over_prior::Bool=false,
         use_prior::Bool= false
    )
    @show use_prior
    npairs = size(wordpairs,1)
    sims = Vector{Float64}(npairs)

    # Context-specific embedding for the word in column `col` of row `row`.
    function synth_vector(row, col)
        emb::Vector{Float32} = SenseAlignment.synthesize_embedding(
                ee, contexts[row,col], wordpairs[row,col], "";
                normalise_over_context_length=normalise_over_context_length,
                normalize_over_prior=normalize_over_prior,
                use_prior=use_prior)
        return emb
    end

    @showprogress for row in 1:npairs
        try
            left = synth_vector(row, 1)
            right = synth_vector(row, 2)
            sims[row] = 1 - cosine_dist(left, right)
        catch err
            if typeof(err) in (KeyError, Query.NoContextError)
                println(err)
                sims[row] = NaN
            else
                rethrow(err)
            end
        end
    end
    return sims
end

get_sims_synth (generic function with 1 method)

In [21]:
# Show the available method signatures of synthesize_embedding.
# NOTE(review): used unqualified here but as SenseAlignment.synthesize_embedding
# elsewhere in this notebook — confirm it is exported.
methods(synthesize_embedding)

In [20]:
# SCWS score of the synthesised-embedding similarity for the AdaGram model on the
# lowercased, windowed contexts, with the prior enabled.
corspearman(scws_groundsim, 
get_sims_synth(am, scws_lc_wordpairs, scws_lc_windowed_contexts,
normalise_over_context_length=true,
normalize_over_prior=false,
use_prior=true
))

0.6502712538816777

use_prior = true
Progress:   8%|███                                      |  ETA: 0:00:34KeyError(" SubString{String}[\"insufflate\"], nor SubString{String}[\"\"] have embeddings")
Progress:  16%|███████                                  |  ETA: 0:00:30

LoadError: LoadError: InterruptException:
while loading In[20], in expression starting on line 1

In [None]:
# AvgSimC on the lowercased, windowed contexts with context-length normalisation off.
# NOTE(review): the variable name says "am" but the model passed is `ee` — confirm which was meant.
avgsimc_am_lc_windows_nosmooth = 
get_avgsimc(ee, scws_lc_wordpairs, scws_lc_windowed_contexts, false)
corspearman(scws_groundsim, avgsimc_am_lc_windows_nosmooth)

In [None]:
# Inspect the `dimension` field of the AdaGram model.
am.dimension

In [None]:
# AvgSimC for the AdaGram model on the lowercased, windowed contexts with
# context-length normalisation on.
avgsimc_am_lc_windows_geosmooth = get_avgsimc(am, scws_lc_wordpairs, scws_lc_windowed_contexts, true)
corspearman(scws_groundsim, avgsimc_am_lc_windows_geosmooth)

In [None]:
# SCWS score of the synthesised-embedding similarity for the AdaGram model on the
# lowercased, windowed contexts. get_sims_synth takes its options as keywords
# only, so the flag is passed by name rather than as a fourth positional argument.
corspearman(scws_groundsim, 
get_sims_synth(am, scws_lc_wordpairs, scws_lc_windowed_contexts;
normalise_over_context_length=true))
#0.650271346861422 Adagram

In [None]:
# AvgSimC for `ee` on the full (un-windowed) lowercased contexts, normalisation on.
corspearman(scws_groundsim, 
get_avgsimc(ee, scws_lc_wordpairs, scws_lc_contexts, true))

In [None]:
# AvgSimC for `ee` on the full (un-windowed) lowercased contexts, normalisation off.
# The trailing comment records a previously observed result.
corspearman(scws_groundsim, 
get_avgsimc(ee, scws_lc_wordpairs, scws_lc_contexts, false))
#0.42377281027185104

In [None]:
# SCWS score of the synthesised-embedding similarity for the AdaGram model on the
# full (un-windowed) lowercased contexts. get_sims_synth takes its options as
# keywords only, so the flag is passed by name rather than positionally.
corspearman(scws_groundsim, 
get_sims_synth(am, scws_lc_wordpairs, scws_lc_contexts;
normalise_over_context_length=true))
#0.6452188028304504

In [None]:
"""
    get_priored_synthsim(ee, wordpairs::Matrix{String}, contexts::Matrix)

Similarity of prior-weighted, context-specific vectors for every row of `wordpairs`.

For each word, `general_wsd` is called with the word's sense vectors and the
prior from `expected_pi` (with `normalise_over_context_length=false`). The log
of its output is divided by the number of context words found in
`ee.dict.word2id`, then exponentiated and used to weight the sense vectors,
which are summed into one vector. The score is `1 - cosine_dist` of the two
resulting vectors.

`ee` must provide the fields `vm` and `dict.word2id`; in this notebook it is
called with the AdaGram model.

Returns a `Vector{Float64}` with one entry per row. Rows that raise `KeyError`
or `Query.NoContextError` are printed and scored `NaN`; other exceptions are
rethrown.
"""
function get_priored_synthsim(ee, wordpairs::Matrix{String},
                                contexts::Matrix)
    npairs = size(wordpairs,1)
    sims = fill(NaN, npairs)

    @showprogress for ii in 1:npairs
        try
            wvs1::Vector{Vector{Float32}} = all_word_sense_vectors(ee,wordpairs[ii,1],"")
            wvs2 = all_word_sense_vectors(ee,wordpairs[ii,2],"")

            # Priors come from the model that was passed in, not from a global.
            prior1 = expected_pi(ee.vm, ee.dict.word2id[wordpairs[ii,1]])
            prior2 = expected_pi(ee.vm, ee.dict.word2id[wordpairs[ii,2]])

            probs1 = general_wsd(ee, contexts[ii,1], wvs1, prior1;
                    normalise_over_context_length=false)
            probs2 = general_wsd(ee, contexts[ii,2], wvs2, prior2;
                    normalise_over_context_length=false)

            # Only context words known to the model count towards the length.
            context_length1 = count(x -> haskey(ee.dict.word2id, x), contexts[ii,1])
            context_length2 = count(x -> haskey(ee.dict.word2id, x), contexts[ii,2])

            lprobs1 = log.(probs1)./context_length1
            lprobs2 = log.(probs2)./context_length2
            loffset1 = maximum(lprobs1) #Offset for stability
            loffset2 = maximum(lprobs2) #Offset for stability
            @assert(isfinite(loffset1), loffset1)
            @assert(loffset1<=0.0, loffset1)
            @assert(isfinite(loffset2), loffset2)
            @assert(loffset2<=0.0, loffset2)

            final_wv1 = exp(loffset1).*sum((exp.(lprobs1.-loffset1)) .* wvs1)
            final_wv2 = exp(loffset2).*sum((exp.(lprobs2.-loffset2)) .* wvs2)

            sims[ii] = 1-cosine_dist(final_wv1, final_wv2)
        catch ex
            if typeof(ex) ∈ (KeyError, Query.NoContextError)
                println(ex)
                sims[ii]=NaN
            else
                rethrow(ex)
            end
        end
    end
    return sims
end

In [None]:
# SCWS score of the prior-weighted synthesised similarity for the AdaGram model
# on the lowercased, windowed contexts.
corspearman(scws_groundsim, 
get_priored_synthsim(am, scws_lc_wordpairs, scws_lc_windowed_contexts))


In [None]:
"""
    get_noised_synthsim(ee, wordpairs::Matrix{String}, contexts::Matrix,
                        extendby::Integer,
                        normalise_over_context_lengths::Bool=true)

Similarity of context-specific vectors built from a noise-augmented sense inventory.

For each word, the sense vectors are extended with `extendby` noisy copies of
every sense: Gaussian noise scaled by 3 × the variance of all components of
that word's sense vectors is added to the original vector. `general_wsd` then
weights the extended set given the matching entry of `contexts`, and the
weighted vectors are summed into one vector. The score is `1 - cosine_dist` of
the two resulting vectors. `normalise_over_context_lengths` is forwarded to
`general_wsd` as `normalise_over_context_length`.

Results are random because of the added noise.

Returns a `Vector{Float64}` with one entry per row. Rows that raise `KeyError`
or `Query.NoContextError` are printed and scored `NaN`; other exceptions are
rethrown.
"""
function get_noised_synthsim(ee, wordpairs::Matrix{String}, contexts::Matrix, extendby::Integer, normalise_over_context_lengths::Bool=true)
    npairs = size(wordpairs,1)
    sims = fill(NaN, npairs)

    @showprogress for ii in 1:npairs
        try
            wvs1::Vector{Vector{Float32}} = all_word_sense_vectors(ee,wordpairs[ii,1],"")
            # The vector length of the first word is reused for the second word's noise.
            len = length(first(wvs1))
            # NOTE(review): the noise is scaled by the variance, not the standard
            # deviation — confirm this is intended.
            var1 = var(hcat(wvs1...))
            wvs1 = [wvs1;[randn(Float32, len).*3*var1+wv
                          for wv in wvs1 for _ in 1:extendby]]

            wvs2 = all_word_sense_vectors(ee,wordpairs[ii,2],"")
            var2 = var(hcat(wvs2...))
            wvs2 = [wvs2;[randn(Float32, len).*3*var2+wv
                          for wv in wvs2 for _ in 1:extendby]]

            probs1 = general_wsd(ee, contexts[ii,1], wvs1;
                    normalise_over_context_length=normalise_over_context_lengths)
            probs2 = general_wsd(ee, contexts[ii,2], wvs2;
                    normalise_over_context_length=normalise_over_context_lengths)

            lprobs1 = log.(probs1)
            lprobs2 = log.(probs2)
            loffset1 = maximum(lprobs1) #Offset for stability
            loffset2 = maximum(lprobs2) #Offset for stability
            @assert(isfinite(loffset1), loffset1)
            @assert(loffset1<=0.0, loffset1)
            @assert(isfinite(loffset2), loffset2)
            @assert(loffset2<=0.0, loffset2)

            final_wv1 = exp(loffset1).*sum((exp.(lprobs1.-loffset1)) .* wvs1)
            final_wv2 = exp(loffset2).*sum((exp.(lprobs2.-loffset2)) .* wvs2)

            sims[ii] = 1-cosine_dist(final_wv1, final_wv2)
        catch ex
            if typeof(ex) ∈ (KeyError, Query.NoContextError)
                println(ex)
                sims[ii]=NaN
            else
                rethrow(ex)
            end
        end
    end
    return sims
end

In [None]:
# SCWS score of the noise-augmented synthesised similarity for the AdaGram model:
# 3 noisy copies per sense, context-length normalisation on.
corspearman(scws_groundsim, 
get_noised_synthsim(am, scws_lc_wordpairs, scws_lc_windowed_contexts, 3, true))


In [None]:
# Shape check: variance along dimension 2 of the matrix whose columns are the
# sense vectors of "fire".
var(hcat(all_word_sense_vectors(am,"fire","")...),2) |> size

In [None]:
"""
    tt()

Scratch experiment: take the sense vectors of "fire" from the global `am`,
append 3 noisy copies of each (Gaussian noise scaled by 0.5), and print the
resulting count, the length of the first vector and the container type.
"""
function tt()
    extendby = 3
    wvs1::Vector{Vector{Float32}} = all_word_sense_vectors(am, "fire", "")
    len = length(first(wvs1))
    # Build the noisy copies first, then append them to the originals.
    noisy_copies = [randn(Float32, len)*0.5+wv
                    for wv in wvs1 for _ in 1:extendby]
    wvs1 = [wvs1; noisy_copies]

    @show length(wvs1)
    @show length(wvs1|>first)
    @show typeof(wvs1)
end
tt()

In [None]:
# NOTE(review): leftover scratch line. The leading `end` has no matching opener
# and `a` is not defined in this notebook, so this cell does not run as written.
# It looks like an early draft of the "append noisy copies" expression — confirm and remove.
end[a;((randn(length(a))+a for wv in a))]