In [24]:
using ProgressMeter

In [1]:
using NearestNeighbors
using WordEmbeddings, SoftmaxClassifier
using Utils
using Query
using Base.Collections
using WordStreams
using DataStructures
using Distances
using Iterators
using StatsBase
using Training

In [2]:
using JLD
# Load the object saved under the name "ee" from the JLD file below.
# NOTE(review): `ee` is later passed to the `get_sims` methods typed
# `FixedWordSenseEmbedding`, so presumably that is what this file holds — confirm.
ee = load("models/ss/tokenised_lowercase_WestburyLab.wikicorp.201004_300_i1.jld","ee");
#dtree,labels = nn_tree(ee)
#""

In [12]:
using AdaGram
using AdaGramCompat
# Load the object saved under the name "am" from the JLD file below.
# NOTE(review): presumably an `AdaGramCompat.AdaGramModel` (it is passed to
# `get_sims_synth` further down) — confirm against the saved file.
am = load("models/adagram/more_senses.adagram_model.jld", "am");


In [3]:
"""
    load_wordsim353(filename="data/corpora/wordsim353/combined.csv")

Read a comma-separated word-similarity file, skipping its first line.

Returns the tuple `(wordpairs, groundsim)`: `wordpairs` is a `Matrix{String}` built from
the first two columns and `groundsim` is a `Vector{Float64}` built from the third column.
"""
function load_wordsim353(filename="data/corpora/wordsim353/combined.csv")
    table = readdlm(filename, ','; skipstart=1)
    pair_columns = table[:, 1:2]
    score_column = table[:, 3]
    return (convert(Matrix{String}, pair_columns), convert(Vector{Float64}, score_column))
end


load_wordsim353 (generic function with 2 methods)

In [5]:
"""
    get_sims(ee::FixedWordSenseEmbedding, wordpairs::Matrix{String})

Score every row of `wordpairs` (column 1 vs column 2) without using any context.

For each pair the entries stored in `ee.embedding` for both words are concatenated with
`hcat`, the pairwise `CosineDist` between the two resulting matrices is computed, and
the similarity is `1 - ` the smallest distance found (i.e. the closest pair wins).

Returns a `Vector{Float64}` with one entry per row. A pair whose word is missing from
`ee.embedding` (a `KeyError`) is logged with `warn` and left as `NaN`; any other
exception is rethrown, matching the context-aware `get_sims` method.
"""
function get_sims(ee::FixedWordSenseEmbedding, wordpairs::Matrix{String})
    npairs = size(wordpairs, 1)
    # Pre-fill with NaN so pairs that cannot be scored need no further handling.
    sims = fill(NaN, npairs)
    for ii in 1:npairs
        try
            wvs_1 = hcat(ee.embedding[wordpairs[ii, 1]]...)
            wvs_2 = hcat(ee.embedding[wordpairs[ii, 2]]...)
            sims[ii] = 1 - minimum(pairwise(CosineDist(), wvs_1, wvs_2))
        catch ex
            # Only out-of-vocabulary words are expected here; do not hide real bugs.
            isa(ex, KeyError) || rethrow()
            warn(ex)
        end
    end
    return sims
end


get_sims (generic function with 1 method)

In [6]:
# Context-free evaluation: load the word pairs and their reference scores from the
# default path of `load_wordsim353`.
wordpairs, groundsim = load_wordsim353();

# Words are lowercased before being looked up; the cell's value is the Spearman rank
# correlation (`corspearman`) between the reference scores and the computed similarities.
sims = get_sims(ee,map(lowercase,wordpairs))
corspearman(groundsim,sims)

0.49821807559772224

In [7]:
"""
    load_SCWS(filename="data/corpora/SCWS/ratings.txt")

Read a tab-separated ratings file (every field is read as a string; quote and comment
characters are not treated specially).

Returns the tuple `(wordpairs, contexts, groundsim)`:

- `wordpairs`: columns 2 and 4, lowercased.
- `contexts`: columns 6 and 7, each cell turned into a vector of lowercased tokens
  (split on single spaces) with every `<b>` token and the two tokens that follow it
  removed. NOTE(review): this presumably strips a `<b> target </b>` marker around a
  single-token target word — confirm against the data file.
- `groundsim`: column 8 parsed as `Float64`.
"""
function load_SCWS(filename = "data/corpora/SCWS/ratings.txt")
    function get_context2(fulltext)
        # A plain loop replaces the former Task/produce coroutine; `Any[]` keeps the
        # element type that collecting the Task used to give.
        tokens = Any[]
        skip_next = 0
        for word in split(fulltext, ' ')
            skip_next -= 1
            if word == "<b>"
                # Drop this token and the next two.
                skip_next = 3
            end
            skip_next > 0 && continue
            push!(tokens, lowercase(word))
        end
        return tokens
    end
    entries = readdlm(filename, '\t', String; quotes=false, comments=false)
    wordpairs = map(lowercase, entries[:, [2, 4]])
    groundsim = map(s -> parse(Float64, s), entries[:, 8])
    contexts = map(get_context2, entries[:, [6, 7]])
    return (wordpairs, contexts, groundsim)
end


load_SCWS (generic function with 2 methods)

In [10]:
"""
    get_sims(ee::FixedWordSenseEmbedding, wordpairs::Matrix{String}, contexts::Matrix)

Score every row of `wordpairs` using the matching row of `contexts`.

For each word, `WSD` is called with the word and its context (`skip_oov=true`) and its
result is used to index the word's entry in `ee.embedding`; the similarity is
`1 - cosine_dist` of the two selected vectors.

Returns a `Vector{Float64}` with one entry per row. A `KeyError` is logged with `warn`
and leaves that row as `NaN`; every other exception is rethrown.
"""
function get_sims(ee::FixedWordSenseEmbedding, wordpairs::Matrix{String}, contexts::Matrix)
    npairs = size(wordpairs, 1)
    sims = fill(NaN, npairs)
    for row in 1:npairs
        word_a = wordpairs[row, 1]
        word_b = wordpairs[row, 2]
        try
            sense_a = WSD(ee, word_a, contexts[row, 1]; skip_oov=true)
            vec_a = ee.embedding[word_a][sense_a]

            sense_b = WSD(ee, word_b, contexts[row, 2]; skip_oov=true)
            vec_b = ee.embedding[word_b][sense_b]

            sims[row] = 1 - cosine_dist(vec_a, vec_b)
        catch ex
            isa(ex, KeyError) || rethrow(ex)
            warn(ex)
        end
    end
    return sims
end


get_sims (generic function with 2 methods)

In [11]:
# In-context evaluation. This rebinds `wordpairs` and `groundsim` (previously holding
# the result of `load_wordsim353`) to what `load_SCWS` returns.
wordpairs,contexts,groundsim = load_SCWS();
# One sense per word is selected from its context (see the three-argument `get_sims`);
# the cell's value is the Spearman rank correlation with the reference scores.
local_sims = get_sims(ee,wordpairs,contexts)
corspearman(groundsim,local_sims)

2016-09-02T22:19:06.124 - warn: KeyError: key "aglow" not found prefix: "error: "
2016-09-02T22:19:08.49 - warn: KeyError: key "insufflate" not found prefix: "error: "
2016-09-02T22:19:13.071 - warn: KeyError: key "anorgasmia" not found prefix: "error: "
2016-09-02T22:19:14.341 - warn: KeyError: key "backdate" not found prefix: "error: "
2016-09-02T22:19:14.911 - warn: KeyError: key "notarize" not found prefix: "error: "
2016-09-02T22:19:19.579 - warn: KeyError: key "unblock" not found prefix: "error: "
2016-09-02T22:19:21.777 - warn: KeyError: key "escapologist" not found prefix: "error: "
2016-09-02T22:19:22.537 - warn: KeyError: key "tombac" not found prefix: "error: "
2016-09-02T22:19:22.716 - warn: KeyError: key "nutriment" not found prefix: "error: "
2016-09-02T22:19:27.997 - warn: KeyError: key "bated" not found prefix: "error: "
2016-09-02T22:19:29.433 - warn: KeyError: key "cosmographer" not found prefix: "error: "
2016-09-02T22:19:29.788 - warn: KeyError: key "exudation" not 

0.39415271981504646

In [13]:
"""
    all_word_sense_vectors(ee::WordSenseEmbedding, word)

Return the entry stored for `word` in `ee.embedding`, or an empty
`Vector{Vector{Float32}}` when `word` is not a key.
"""
function all_word_sense_vectors(ee::WordSenseEmbedding, word)
    no_senses = Vector{Float32}[]
    return get(ee.embedding, word, no_senses)
end

"""
    all_word_sense_vectors(am::AdaGramCompat.AdaGramModel, word)

Return one `view` per column of the matrix given by `word_sense_vectors(am, word)`, or
an empty `Vector{Vector{Float32}}` when `word` is not a key of `am.dict.word2id`.
"""
function all_word_sense_vectors(am::AdaGramCompat.AdaGramModel, word)
    # Unknown word: nothing to look up.
    haskey(am.dict.word2id, word) || return Vector{Float32}[]
    sense_matrix = word_sense_vectors(am, word)
    return [view(sense_matrix, :, col) for col in 1:size(sense_matrix, 2)]
end

all_word_sense_vectors (generic function with 2 methods)

In [27]:
"""
    normal_probs(logprobs::Vector)

Turn log-probabilities into probabilities that sum to one.

The largest entry is subtracted before exponentiating, so the biggest exponent is
`exp(0)` and very large or very small log-probabilities do not overflow or all
underflow. `logprobs` itself is not modified.

Throws an `ArgumentError` when `logprobs` is empty.
"""
function normal_probs(logprobs::Vector)
    isempty(logprobs) && throw(ArgumentError("logprobs must not be empty"))
    max_lp = maximum(logprobs)
    # Build a new floating-point array rather than exponentiating in place: writing
    # `exp` results back into a copy of an integer vector would throw `InexactError`.
    ret = exp.(logprobs .- max_lp)
    denom = sum(ret)
    return ret ./ denom
end
"""
    weighted_average(logprobs, embeddings)

Return the sum of `embeddings`, each scaled by the matching entry of
`normal_probs(logprobs)`.

The accumulator starts as an all-zero array shaped like the first embedding.

Throws a `DimensionMismatch` when `logprobs` and `embeddings` differ in length;
otherwise `zip` would silently drop the surplus entries and the weights actually
applied would no longer correspond to the embeddings.
"""
function weighted_average(logprobs, embeddings)
    length(logprobs) == length(embeddings) || throw(DimensionMismatch(
        "got $(length(logprobs)) log-probabilities for $(length(embeddings)) embeddings"))
    # `zero(array)` gives the same all-zero array as the older `zeros(array)` form.
    ret = zero(first(embeddings))
    for (weight, embedding) in zip(normal_probs(logprobs), embeddings)
        ret .+= weight .* embedding
    end
    return ret
end


"""
    synthesize_embedding(ee, context::AbstractVector, word_or_phrase::AbstractString)

Build a single vector for `word_or_phrase` given `context`.

`word_or_phrase` is split on `" "`, the results of `all_word_sense_vectors` for every
piece are pooled, each pooled vector is scored with `Query.logprob_of_context`
(`skip_oov=true`, `normalise_over_length=true`), and the vectors are combined with
`weighted_average` using those scores.

Throws a `KeyError` when no piece has any vector.
"""
function synthesize_embedding(ee,context::AbstractVector, word_or_phrase::AbstractString)
    words = split(word_or_phrase, " ")
    sense_vectors = vcat((all_word_sense_vectors(ee, w) for w in words)...)
    isempty(sense_vectors) && throw(KeyError("None of $words have embeddings"))
    context_logprobs = map(sense_vectors) do sense_vector
        Query.logprob_of_context(ee, context, sense_vector;
                                 skip_oov=true, normalise_over_length=true)
    end
    return weighted_average(context_logprobs, sense_vectors)
end

"""
synthesize_embedding:: Embeddings, Context, Word -> WordVector
"""
    function get_sims_synth(ee, wordpairs::Matrix{String}, contexts::Matrix)
    sims = Vector{Float64}(size(wordpairs,1))
    npairs = size(wordpairs,1)
        @showprogress for ii in 1:npairs
        try           
            wv1 = synthesize_embedding(ee, contexts[ii,1],wordpairs[ii,1])
            wv2 = synthesize_embedding(ee, contexts[ii,2],wordpairs[ii,2])
             
            sims[ii]=1-cosine_dist(wv1,wv2)
        catch ex
            if typeof(ex)==KeyError
                warn(ex)
                sims[ii]=NaN
            else
                rethrow(ex)
            end
        end
    end
    return sims
end


get_sims_synth

In [28]:
# Spearman rank correlation between the reference scores and the synthesized-vector
# similarities computed from `ee`.
corspearman(groundsim,get_sims_synth(ee,wordpairs,contexts))

Progress:   0%|                                         |  ETA: 0:04:442016-09-02T22:41:36.049 - warn: KeyError: key "None of SubString{String}[\"aglow\"] have embeddings" not found prefix: "error: "
Progress:   8%|███                                      |  ETA: 0:00:342016-09-02T22:41:38.759 - warn: KeyError: key "None of SubString{String}[\"insufflate\"] have embeddings" not found prefix: "error: "
Progress:  23%|██████████                               |  ETA: 0:00:272016-09-02T22:41:44.106 - warn: KeyError: key "None of SubString{String}[\"anorgasmia\"] have embeddings" not found prefix: "error: "
Progress:  27%|███████████                              |  ETA: 0:00:252016-09-02T22:41:45.51 - warn: KeyError: key "None of SubString{String}[\"backdate\"] have embeddings" not found prefix: "error: "
Progress:  29%|████████████                             |  ETA: 0:00:252016-09-02T22:41:46.145 - warn: KeyError: key "None of SubString{String}[\"notarize\"] have embeddings" not found pre

0.524743130161508

In [29]:
# Same evaluation as for `ee`, but with the model loaded into `am`.
corspearman(groundsim,get_sims_synth(am,wordpairs,contexts))

Progress:   8%|███                                      |  ETA: 0:04:332016-09-02T22:42:36.131 - warn: KeyError: key "None of SubString{String}[\"insufflate\"] have embeddings" not found prefix: "error: "
Progress:  28%|███████████                              |  ETA: 0:03:312016-09-02T22:43:33.348 - warn: KeyError: key "None of SubString{String}[\"backdate\"] have embeddings" not found prefix: "error: "
Progress: 100%|█████████████████████████████████████████| Time: 0:04:52


0.6452187057383217