In [1]:
using ProgressMeter
using SwiftObjectStores
using CorpusLoaders
using WordNet
using AdaGram
using AdaGramCompat
using WordEmbeddings, SoftmaxClassifier
using Utils
using Query
using Distances
using JLD

using SenseAlignment

importfrom(CorpusLoaders, :sensekey)
importfrom(WordNet, :sensekey)

In [2]:
# Directory holding the WordNet data files (the path names version 2.1).
const WN_PATH="data/corpora/WordNet-2.1/"
#WN_PATH3 = "/usr/share/nltk_data/corpora/wordnet/"
# Global WordNet database handle; later cells pass it to functions as `db` / `wn`.
db = DB(WN_PATH)

WordNet.DB

In [3]:
# Load the variable stored as "am" from the JLD file into the global `am`.
# The commented-out lines are earlier ways of loading a model, kept for reference.
#am = AdaGramCompat.AdaGramModel(load_model("models/adagram/v1_d100.adagram_model")...)
#am = AdaGramModel(load_model("models/adagram/more_senses.adagram_model")...)
#am = open(deserialize,"models/adagram/more_senses.adagram_model.jsz", "r");
am = load("models/adagram/more_senses.adagram_model.jld", "am");

In [None]:
# Count the entries of `ee.embedding` whose value has length greater than 1.
# NOTE(review): `ee` is only assigned in a later cell (In [82]); run that cell first.
sum(length.(collect(values(ee.embedding))).>1)

In [None]:
# Load the "am" variable of the semhuff JLD file through `get_jld` on a `SwiftService`.
# Presumably "sensemodels" names the remote container — confirm against SwiftObjectStores.
s_am = get_jld(SwiftService(), "sensemodels", "adagram/semhuff_more_senses.adagram_model.jld", "am");

In [82]:
# Load the variable stored as "ee" from the JLD file into the global `ee`.
# No trailing `;`, so the notebook displays the loaded value.
#ee = restore("models/ss/tokenised_lowercase_WestburyLab.wikicorp.201004_100_i1.jld");
ee = load("models/ss/tokenised_lowercase_WestburyLab.wikicorp.201004_100_i1.jld", "ee")

#ee = restore("models/ss/keep/tokenised_lowercase_WestburyLab.wikicorp.201004_50__m170000000.model");

In [4]:
# Load the SemEval-2007 task 7 test challenges (collected into a vector) and their key.
# The `10` and the `x->!isalnum(x)` predicate are passed straight to the loader;
# presumably a context window size and a token filter — TODO confirm against CorpusLoaders.
challenges = lazyload_challenges_semeval2007t7("data/corpora/wsd/semeval2007_t7/test/eng-coarse-all-words.xml",
                                            10, x->!isalnum(x)) |> collect;
solutions = load_solutions_semeval2007t7("data/corpora/wsd/semeval2007_t7/key/dataset21.test.key");

#semcor = index_semcor(lazyload_semcor("data/corpora/semcor2.1/brown1/tagfiles/"));

In [None]:
# Build one index over the three SemCor 2.1 parts (brown1, brown2, brownv).
# Newlines and `;` inside the brackets both concatenate vertically, so the three
# loaded collections are joined end to end before indexing.
semcor = index_semcor([load_semcor("data/corpora/semcor2.1/brown1/tagfiles/")
    load_semcor("data/corpora/semcor2.1/brown2/tagfiles/");
    load_semcor("data/corpora/semcor2.1/brownv/")
    ]);


In [None]:
# Compare the number of entries in `db.counts` with the number in `db.sensekeys`,
# showing both sizes, their difference, and the difference as a fraction of `ws`.
cs = length(db.counts)
ws = length(db.sensekeys)
@show cs ws
@show ws - cs
@show (ws - cs) / ws


In [None]:
# Display the first loaded challenge to inspect its fields.
challenges[1]

In [26]:
"""
    window(context, index::Int, window_size::Int=10)

Return a view of the elements of `context` lying within `window_size ÷ 2` positions
on either side of `index`, clipped to the bounds of `context`. The element at
`index` itself is excluded.
"""
function window(context, index::Int, window_size::Int=10)
    half = window_size ÷ 2
    first_kept = max(index - half, 1)
    last_kept = min(index + half, length(context))
    # Positions before and after `index`, skipping `index` itself.
    neighbours = vcat(first_kept:index-1, index+1:last_kept)
    return view(context, neighbours)
end

"Window the lower-cased result of `strip_tags(tagged_sense)` around position `index`."
function window(tagged_sense::TaggedSentence, index::Int, window_size::Int=10)
    return window(lowercase.(strip_tags(tagged_sense)), index, window_size)
end
   
"""
    window(context, word::AbstractString, window_size::Int=10)

Lower-case `context` and window it around the middle occurrence of `word`.
When `word` does not occur, the whole lower-cased context is returned unwindowed.
"""
function window(context, word::AbstractString, window_size::Int=10)
    lowered = lowercase.(context)
    hits = find(lowered .== word)
    isempty(hits) && return lowered  #don't window
    # Middle occurrence, rounding up when the number of hits is odd.
    middle_hit = hits[ceil(Int, length(hits) / 2)]
    return window(lowered, middle_hit, window_size)
end

window (generic function with 6 methods)

In [27]:

"Collect up the usages from a indexed tagged source"
function get_usages(usage_index::SemcorIndex, key)
    # An unknown key has no usages: return an empty list of contexts.
    haskey(usage_index, key) || return Vector{String}[]
    occurrences = usage_index[key]
    # One lower-cased, windowed context per (sentence, position) occurrence.
    return [window(lowercase.(strip_tags(sent)), index) for (sent, index) in occurrences]
end

"""
    get_usages(synset::Synset, lemma_word::AbstractString)

Return a one-element vector holding the lower-cased, tokenised gloss of `synset`.
`lemma_word` is currently unused: windowing the gloss around it
(`[window(gloss, lemma_word)]`) is disabled in favour of the whole gloss.
"""
function get_usages(synset::Synset, lemma_word::AbstractString)
    tokens = punctuation_space_tokenize(synset.gloss)
    gloss = convert(Vector{SubString{String}}, lowercase.(tokens))
    return [gloss]
end

"""
    get_all_usages(wn::DB, lemma_word, pos)

Map every synset of the lemma `wn[pos, lemma_word]` to its usage examples, as
produced by `get_usages(synset, lemma_word)`.

All lookups go through the `wn` argument rather than the global `db`.
Throws whatever `wn[pos, lemma_word]` throws when the lemma is unknown.
"""
function get_all_usages(wn::DB, lemma_word, pos)
    lemma = wn[pos, lemma_word]
    target_synsets::Vector{Synset} = synsets(wn, lemma)

    # Only glosses are used. The SemCor variant,
    # [get_usages(semcor, sensekey(wn, synset, lemma)); get_usages(synset, lemma_word)],
    # is disabled.
    Dict{Synset,AbstractVector{AbstractVector}}((synset => get_usages(synset, lemma_word)
                    for synset in target_synsets))
end

get_all_usages (generic function with 1 method)

In [69]:
"Return `true` when no element of `col` after the first differs from `col[1]`."
function all_identical(col)
    length(col) == 1 && return true
    for item in col[2:end]
        item != col[1] && return false
    end
    return true
end

"""
    _lexically_informed_embeddings(wn::DB, ee, word, lemma_word, pos; kw_args...)

Build one embedding per synset of `(lemma_word, pos)` by passing the concatenated
usage examples of that synset to `synthesize_embedding`.

Returns `(embeddings, target_synsets, lem)`, where `embeddings[i]` belongs to
`target_synsets[i]` and `lem` is the lemma looked up in `wn`.

Throws `KeyError` when there are several synsets but all of their embeddings are
identical. `kw_args` are forwarded to `synthesize_embedding`.
"""
function _lexically_informed_embeddings(wn::DB,ee, word,lemma_word, pos;kw_args... )
    target_synset_examples = get_all_usages(wn, lemma_word, pos)
    target_synsets = collect(keys(target_synset_examples))
    # Look the lemma up in the database that was passed in, not the global `db`.
    lem = wn[lemma_word, pos]

    # Iterate the synset list itself so embeddings[i] lines up with target_synsets[i]
    # by construction.
    embeddings = Vector{Float32}[]
    for synset in target_synsets
        context::Vector{SubString{String}} = vcat(target_synset_examples[synset]...)
        @assert length(context) > 0
        push!(embeddings, synthesize_embedding(ee, context, word, lemma_word; kw_args...))
    end

    # Identical embeddings cannot tell the senses apart; report it as a missing key
    # so callers treat the word as not handled.
    if length(target_synsets)!=1 && all_identical(embeddings)
        throw(KeyError("Only 1 embedding for $word"))
    end

    embeddings,target_synsets,lem
end

    
# Memo table for `lexically_informed_embeddings`, keyed by the full argument list.
_li_embeddings = Dict{Any, Tuple{Vector{Vector{Float32}}, Vector{Synset}, Lemma}}()

"Memoised front end to `_lexically_informed_embeddings`; same arguments and result."
function lexically_informed_embeddings(args...;kwargs... )
    cache_key = (args, kwargs)
    compute = () -> _lexically_informed_embeddings(args...; kwargs...)
    return get!(compute, _li_embeddings, cache_key)
end

lexically_informed_embeddings (generic function with 1 method)

In [85]:
"""
    supervised_wsd(challenge, ee, wn::DB; normalise_over_context_length=true,
                   normalize_over_prior=false, use_prior_for_alignment=false,
                   MFS_backoff=false)

Pick a sense for `challenge` by scoring its lower-cased context with `general_wsd`
against the per-synset embeddings from `lexically_informed_embeddings`. Add-one
smoothed sense counts from `wn` serve as priors.

Returns a `Nullable` holding the sense key of the highest-scoring synset.
On failure:
- an `InterruptException` is always rethrown, so a run can be aborted;
- with `MFS_backoff`, any other error falls back to `most_frequent_sense`;
- otherwise a `KeyError` gives an empty `Nullable{String}` and anything else is rethrown.
"""
function supervised_wsd(challenge, ee, wn::DB; 
     normalise_over_context_length::Bool=true,
     normalize_over_prior::Bool=false,
     use_prior_for_alignment::Bool= false,
     MFS_backoff::Bool = false
    )
    try
        embeddings,target_synsets,lem = lexically_informed_embeddings(wn,ee,
                                                            challenge.word, 
                                                            challenge.lemma, challenge.pos; 
                                                            use_prior = use_prior_for_alignment,
                                                            normalise_over_context_length = normalise_over_context_length,
                                                            normalize_over_prior = normalize_over_prior)

        # Sense counts come from the database that was passed in (`wn`), add-one
        # smoothed and normalised to sum to one.
        priors = [float(sensecount(wn, ss, lem)) for ss in target_synsets] #TODO use thing
        priors .+= 1#length(priors)
        #priors .+= sqrt(sum(priors))/length(priors)
        priors ./= sum(priors)

        @assert(length(challenge.context) > 0, challenge)
        context = lowercase.(challenge.context)
        probs_of_sense = general_wsd(ee, context, embeddings, priors; 
                                    normalise_over_context_length = normalise_over_context_length,
                                    normalize_over_prior = normalize_over_prior)

        synset = target_synsets[indmax(probs_of_sense)]
        return Nullable(sensekey(wn, synset, lem))
    catch ex
        # Never turn a user interrupt into a back-off answer.
        isa(ex, InterruptException) && rethrow(ex)
        if MFS_backoff
            return most_frequent_sense(challenge, wn)
        end
        isa(ex, KeyError) || rethrow(ex)
        return Nullable{String}()
    end
end

"""
    most_frequent_sense(challenge, wn::DB)

Return a `Nullable` holding the sense key of the synset of
`(challenge.lemma, challenge.pos)` with the highest `sensecount` in `wn`.
A `KeyError` during lookup gives an empty `Nullable{String}`; other errors are rethrown.

All lookups use the `wn` argument rather than the global `db`.
"""
function most_frequent_sense(challenge, wn::DB)
    try
        lem = wn[challenge.pos, challenge.lemma]
        target_synsets::Vector{Synset} = synsets(wn, lem)

        sense_freqs = Float32[sensecount(wn, ss, lem) for ss in target_synsets]
        synset = target_synsets[indmax(sense_freqs)]

        return Nullable(sensekey(wn, synset, lem))
    catch ex
        isa(ex, KeyError) || rethrow(ex)
        return Nullable{String}()
    end
end



most_frequent_sense (generic function with 1 method)

In [86]:
"""
    evalute_on_wsd_challenge(challenges, solutions, method)

Apply `method` to each challenge and score its answer against the paired solution.
A null answer counts as not attempted; otherwise the answer is correct when it is
in `ground_solution.solutions`.

Prints the number not attempted and returns `(precision, recall, f1)`. Precision is
over attempted items, recall over all items.
"""
function evalute_on_wsd_challenge(challenges, solutions, method)
    correct, incorrect, notattempted = 0, 0, 0
    @showprogress for (challenge, ground_solution) in zip(challenges, solutions)
        # Challenges and solutions must be in the same order.
        assert(challenge.id == ground_solution.id)
        output_sense = method(challenge)
        if isnull(output_sense)
            notattempted += 1
        elseif get(output_sense) ∈ ground_solution.solutions
            correct += 1
        else
            incorrect += 1
        end
    end
    @show notattempted
    attempted = correct + incorrect
    prec = correct / attempted
    rec = correct / (attempted + notattempted)
    f1 = (2 * prec * rec) / (prec + rec)
    return prec, rec, f1
end
    
    
"Evaluate `supervised_wsd` with model `ee` and database `wn`; `kwargs` are forwarded to it."
function evalute_on_wsd_challenge(challenges, solutions, ee, wn::DB; kwargs...)
    return evalute_on_wsd_challenge(challenges, solutions,
        challenge -> supervised_wsd(challenge, ee, wn; kwargs...))
end

"Evaluate the `most_frequent_sense` baseline on the challenges using database `wn`."
function evalute_on_wsd_challenge_MFS(challenges, solutions, wn::DB)
    return evalute_on_wsd_challenge(challenges, solutions,
        challenge -> most_frequent_sense(challenge, wn))
end

evalute_on_wsd_challenge_MFS (generic function with 1 method)

In [87]:
# Evaluate `supervised_wsd` with the AdaGram model `am`, normalising over context
# length only. The tuples below are previously recorded (precision, recall, f1)
# results for three ways of handling the gloss.
evalute_on_wsd_challenge(challenges, solutions, am, db;   
    normalise_over_context_length = true,
    normalize_over_prior = false,
    use_prior_for_alignment = false)
#(0.7825319805910895,0.7818422212428383,0.7821869488536156) #using first 5 words of gloss if word not found
#(0.7847375385972651,0.7840458351696783,0.7843915343915344) #using whole gloss if word not found
#(0.7834142037935597	0.7827236668135743	0.7830687830687831) #using whole gloss always

Progress:   1%|                                         |  ETA: 0:04:46

LoadError: LoadError: InterruptException:
while loading In[87], in expression starting on line 1

In [None]:
# NOTE(review): zero-argument call; the only other use in this notebook passes
# (ee, context, word, lemma_word; kw...). Looks like a scratch cell — confirm or remove.
synthesize_embedding()

In [65]:
# Inspect the sense priors `am` gives for the word "six" (empty lemma string).
all_word_sense_priors(am,"six","")

30-element Array{Float64,1}:
 0.0508993  
 0.0746286  
 0.0819043  
 0.0334354  
 0.0699029  
 0.133938   
 0.0746971  
 0.0361085  
 0.032563   
 0.0735142  
 0.102609   
 0.0471056  
 0.0558083  
 0.0865038  
 0.046381   
 8.65113e-7 
 1.7389e-7  
 3.52441e-8 
 7.14442e-9 
 1.44827e-9 
 2.93583e-10
 5.95131e-11
 1.20641e-11
 2.44555e-12
 4.95744e-13
 1.00494e-13
 2.03714e-14
 4.12955e-15
 8.37114e-16
 2.12839e-16

In [68]:
# List the WordNet synsets of the noun "six", for comparison with the priors above.
synsets(db,db["six",'n'])

1-element Array{WordNet.Synset,1}:
 (n) VI, hexad, 6, six, sextuplet, Captain Hicks, half a dozen, sixer, sise, sextet, sestet (the cardinal number that is the sum of five and one)

In [72]:
# Slice `am.vm.In` at the id of "six"; the trailing `;` suppresses display.
# Presumably this slice holds the word's per-sense input vectors — confirm against AdaGram.
am.vm.In[:, :, am.dict.word2id["six"]];

In [91]:
# Evaluate `supervised_wsd` with the model `ee`, using the prior for alignment only
# and no MFS back-off, then print (precision, recall, f1) tab-separated.
evalute_on_wsd_challenge(challenges, solutions, ee, db;  
MFS_backoff = false,
normalise_over_context_length = false,
normalize_over_prior = false,
use_prior_for_alignment = true
) |> x->join(x,"\t") |> println


Progress: 100%|█████████████████████████████████████████| Time: 0:00:23
notattempted = 455
0.7254685777287762	0.5799911855442926	0.6446240509429342


In [89]:
# Fraction of challenges attempted, given the 455 reported as `notattempted` above.
(length(challenges) - 455)/length(challenges)


0.7994711326575584

In [None]:
# Evaluate `supervised_wsd` with `am`, normalising over both context length and prior.
# The tuples below are previously recorded (precision, recall, f1) results.
evalute_on_wsd_challenge(challenges, solutions, am, db;
    normalise_over_context_length = true,
    normalize_over_prior = true,
    use_prior_for_alignment = false)
#(0.7860608734009704,0.7853680035257823,0.7857142857142858) #using first 5 words of gloss if word not found
#(0.7913542126157918,0.7906566769501984,0.791005291005291) #using whole gloss if word not found
#(0.7992942214380239,0.7985896870868224,0.798941798941799)  #using whole gloss always

In [None]:
# Evaluate `supervised_wsd` with `am`, no normalisation, prior used for alignment.
# The tuple below is a previously recorded (precision, recall, f1) result.
evalute_on_wsd_challenge(challenges, solutions, am, db;
normalise_over_context_length = false,
normalize_over_prior = false,
    use_prior_for_alignment = true)
#(0.7741508601676224,0.7734684883208461,0.7738095238095237) #using whole gloss always

In [None]:
# Evaluate `supervised_wsd` with `am` and all three options switched off.
# The tuple below is a previously recorded (precision, recall, f1) result.
evalute_on_wsd_challenge(challenges, solutions, am, db;
normalise_over_context_length = false,
normalize_over_prior = false,
use_prior_for_alignment = false)
#(0.7741508601676224,0.7734684883208461,0.7738095238095237) #using whole gloss always

In [None]:
# Evaluate `supervised_wsd` with `am` and all three options switched on.
# The tuples below are previously recorded (precision, recall, f1) results.
evalute_on_wsd_challenge(challenges, solutions, am, db;
    normalise_over_context_length = true,
    normalize_over_prior = true,
    use_prior_for_alignment = true)
#(0.7860608734009704,0.7853680035257823,0.7857142857142858)  #using first 5 words of gloss if word not found
#(0.7913542126157918,0.7906566769501984,0.791005291005291) #using whole gloss if word not found
#(0.7992942214380239,0.7985896870868224,0.798941798941799)  #using whole gloss always

In [None]:
pos(c::eltype(challenges)) = c.pos

# F1 (last element of the result tuple) of `supervised_wsd` with `ee` and MFS back-off,
# restricted to the challenges whose POS tag is `tag`.
function f1_for_pos(tag::Char)
    keep = pos.(challenges) .== tag
    scores = evalute_on_wsd_challenge(challenges[keep], solutions[keep], ee, db;
        normalise_over_context_length = true,
        normalize_over_prior = true,
        use_prior_for_alignment = false,
        MFS_backoff = true)
    return last(scores)
end

f1_n = f1_for_pos('n')
print("\t")

f1_v = f1_for_pos('v')
print("\t")

f1_a = f1_for_pos('a')
print("\t")

f1_r = f1_for_pos('r')
print("\n\n\n\n")

# Noun, verb, adjective and adverb F1 on one tab-separated line.
print(f1_n,"\t")
print(f1_v,"\t")
print(f1_a,"\t")
print(f1_r,"\t")

In [None]:
# NOTE(review): `ch` is not defined anywhere in the visible notebook; this looks like
# an abandoned cell — confirm or remove.
ch

In [None]:

0.7818181818181819	0.7540425531914893	0.779184247538678	0.8888888888888888
		

# Overall (precision, recall, f1) of `supervised_wsd` with `am` and default options.
# The per-POS variants below are disabled.
# NOTE(review): `only_of_pos` is not defined in the visible notebook.
println("overall: \t\t", evalute_on_wsd_challenge(challenges, solutions, am, db))

#println("only nouns  : \t\t", 
#@show nn = evalute_on_wsd_challenge(only_of_pos(challenges,'n'), only_of_pos(solutions,'n'), am, db)
#println("only verbs  : \t\t", 
#@show vv = evalute_on_wsd_challenge(only_of_pos(challenges,'v'), only_of_pos(solutions,'v'), am, db)
#println("only adjecti: \t\t", 
#@show aa = evalute_on_wsd_challenge(only_of_pos(challenges,'a'), only_of_pos(solutions,'a'), am, db)
#println("only adverbs: \t\t", 
#@show rr = evalute_on_wsd_challenge(only_of_pos(challenges,'r'), only_of_pos(solutions,'r'), am, db)

In [None]:
# Overall (precision, recall, f1) of `supervised_wsd` with `ee` and default options.
# NOTE(review): the disabled per-POS lines below still pass `am`, not `ee`, and use
# `only_of_pos`, which is not defined in the visible notebook.
println("overall: \t\t", evalute_on_wsd_challenge(challenges, solutions, ee, db))

#println("only nouns  : \t\t", 
#@show nn = evalute_on_wsd_challenge(only_of_pos(challenges,'n'), only_of_pos(solutions,'n'), am, db)
#println("only verbs  : \t\t", 
#@show vv = evalute_on_wsd_challenge(only_of_pos(challenges,'v'), only_of_pos(solutions,'v'), am, db)
#println("only adjecti: \t\t", 
#@show aa = evalute_on_wsd_challenge(only_of_pos(challenges,'a'), only_of_pos(solutions,'a'), am, db)
#println("only adverbs: \t\t", 
#@show rr = evalute_on_wsd_challenge(only_of_pos(challenges,'r'), only_of_pos(solutions,'r'), am, db)

In [None]:
# Overall (precision, recall, f1) of `supervised_wsd` with `s_am` and default options.
# The per-POS variants below are disabled.
# NOTE(review): `only_of_pos` is not defined in the visible notebook.
println("overall: \t\t", evalute_on_wsd_challenge(challenges, solutions, s_am, db))

#println("only nouns  : \t\t", 
#@show nn = evalute_on_wsd_challenge(only_of_pos(challenges,'n'), only_of_pos(solutions,'n'), s_am, db)
#println("only verbs  : \t\t", 
#@show vv = evalute_on_wsd_challenge(only_of_pos(challenges,'v'), only_of_pos(solutions,'v'), s_am, db)
#println("only adjecti: \t\t", 
#@show aa = evalute_on_wsd_challenge(only_of_pos(challenges,'a'), only_of_pos(solutions,'a'), s_am, db)
#println("only adverbs: \t\t", 
#@show rr = evalute_on_wsd_challenge(only_of_pos(challenges,'r'), only_of_pos(solutions,'r'), s_am, db)

# "models/adagram/more_senses.adagram_model.jld
### with Adagram Disambig weighting (prior used)
### with SemCor data 
overall: 		(0.655408489274304,0.6328779197884531,0.6439461883408072)
- nn = evalute_on_wsd_challenge(only_of_pos(challenges,'n'),only_of_pos(solutions,'n'),am,db) = (0.7161410018552876,0.6967509025270758,0.706312900274474)
- vv = evalute_on_wsd_challenge(only_of_pos(challenges,'v'),only_of_pos(solutions,'v'),am,db) = (0.5106382978723404,0.4873096446700508,0.4987012987012987)
- aa = evalute_on_wsd_challenge(only_of_pos(challenges,'a'),only_of_pos(solutions,'a'),am,db) = (0.6619718309859155,0.649171270718232,0.6555090655509065)
- rr = evalute_on_wsd_challenge(only_of_pos(challenges,'r'),only_of_pos(solutions,'r'),am,db) = (0.7268041237113402,0.6778846153846154,0.7014925373134329)

### with Adagram Disambig weighting (prior used)
### with just glosses 

overall: 		(0.6704701049748973,0.6474217717055972,0.658744394618834)
- nn = evalute_on_wsd_challenge(only_of_pos(challenges,'n'),only_of_pos(solutions,'n'),am,db) = (0.7133580705009277,0.694043321299639,0.7035681610247027)
- vv = evalute_on_wsd_challenge(only_of_pos(challenges,'v'),only_of_pos(solutions,'v'),am,db) = (0.5780141843971631,0.5516074450084603,0.5645021645021645)
- aa = evalute_on_wsd_challenge(only_of_pos(challenges,'a'),only_of_pos(solutions,'a'),am,db) = (0.6535211267605634,0.6408839779005525,0.6471408647140865)
- rr = evalute_on_wsd_challenge(only_of_pos(challenges,'r'),only_of_pos(solutions,'r'),am,db) = (0.7319587628865979,0.6826923076923077,0.7064676616915424)



# "semhuff_more_senses.adagram_model.jld

### with Adagram Disambig weighting (prior used)
### with just glosses 
 - overall: 		(0.6704701049748973,0.6474217717055972,0.658744394618834)
 - nn = evalute_on_wsd_challenge(only_of_pos(challenges,'n'),only_of_pos(solutions,'n'),am,db) = (0.7133580705009277,0.694043321299639,0.7035681610247027)
 - vv = evalute_on_wsd_challenge(only_of_pos(challenges,'v'),only_of_pos(solutions,'v'),am,db) = (0.5780141843971631,0.5516074450084603,0.5645021645021645)
 - aa = evalute_on_wsd_challenge(only_of_pos(challenges,'a'),only_of_pos(solutions,'a'),am,db) = (0.6535211267605634,0.6408839779005525,0.6471408647140865)
 - rr = evalute_on_wsd_challenge(only_of_pos(challenges,'r'),only_of_pos(solutions,'r'),am,db) = (0.7319587628865979,0.6826923076923077,0.7064676616915424)
 
 ### with Adagram Disambig weighting (prior used)
 ### with SemCor data
 - overall: 		(0.6134185303514377,0.5923314235345968,0.6026905829596412)
 - nn = evalute_on_wsd_challenge(only_of_pos(challenges,'n'),only_of_pos(solutions,'n'),s_am,db) = (0.6252319109461967,0.6083032490974729,0.6166514181152789)
 - vv = evalute_on_wsd_challenge(only_of_pos(challenges,'v'),only_of_pos(solutions,'v'),s_am,db) = (0.5638297872340425,0.5380710659898477,0.5506493506493505)
 - aa = evalute_on_wsd_challenge(only_of_pos(challenges,'a'),only_of_pos(solutions,'a'),s_am,db) = (0.6169014084507042,0.6049723756906077,0.610878661087866)
 - rr = evalute_on_wsd_challenge(only_of_pos(challenges,'r'),only_of_pos(solutions,'r'),s_am,db) = (0.6855670103092784,0.6394230769230769,0.6616915422885572)
 
 
 # Most Frequent Sense
 
  -  overall: 		(0.7783164389598942,0.7783164389598942,0.7783164389598942)
  - only nouns  : 		(0.7653429602888087,0.7653429602888087,0.7653429602888087)
  - only verbs  : 		(0.7529610829103215,0.7529610829103215,0.7529610829103215)
  - only adjecti: 		(0.8038674033149171,0.8038674033149171,0.8038674033149171)
  - only adverbs: 		(0.875,0.875,0.875)


In [None]:
# Overall (precision, recall, f1) of `supervised_wsd` with `ee` and default options.
# The per-POS variants below are disabled.
# NOTE(review): `only_of_pos` is not defined in the visible notebook.
println("overall: \t\t", evalute_on_wsd_challenge(challenges, solutions, ee, db))

#println("only nouns  : \t\t", evalute_on_wsd_challenge(only_of_pos(challenges,'n'), only_of_pos(solutions,'n'), ee, db))
#println("only verbs  : \t\t", evalute_on_wsd_challenge(only_of_pos(challenges,'v'), only_of_pos(solutions,'v'), ee, db))
#println("only adjecti: \t\t", evalute_on_wsd_challenge(only_of_pos(challenges,'a'), only_of_pos(solutions,'a'), ee, db))
#println("only adverbs: \t\t", evalute_on_wsd_challenge(only_of_pos(challenges,'r'), only_of_pos(solutions,'r'), ee, db))

In [None]:
# Overall (precision, recall, f1) of the most-frequent-sense baseline.
# The per-POS variants below are disabled.
# NOTE(review): `only_of_pos` is not defined in the visible notebook.
println("overall: \t\t", evalute_on_wsd_challenge_MFS(challenges, solutions, db))

#println("only nouns  : \t\t", evalute_on_wsd_challenge_MFS(only_of_pos(challenges,'n'), only_of_pos(solutions,'n'), db))
#println("only verbs  : \t\t", evalute_on_wsd_challenge_MFS(only_of_pos(challenges,'v'), only_of_pos(solutions,'v'), db))
#println("only adjecti: \t\t", evalute_on_wsd_challenge_MFS(only_of_pos(challenges,'a'), only_of_pos(solutions,'a'), db))
#println("only adverbs: \t\t", evalute_on_wsd_challenge_MFS(only_of_pos(challenges,'r'), only_of_pos(solutions,'r'), db))


In [None]:
#Zero shot WSI for 1 shot WSD
using Training

In [None]:
# Build a fresh embedding `rr` with the same dimension as `ee` and 50 initial senses,
# copy `ee`'s corpus size, distribution, codebook and classification tree into it,
# then initialise it with `Training.initialize_embedding`.
# NOTE(review): `random_inited` and `huffman_tree` are not defined in the visible
# notebook; presumably they come from one of the loaded packages — confirm.
rr = FixedWordSenseEmbedding(ee.dimension, random_inited, huffman_tree; initial_nsenses=50)
rr.corpus_size = ee.corpus_size
rr.distribution = ee.distribution
rr.codebook = ee.codebook
rr.classification_tree = ee.classification_tree
Training.initialize_embedding(rr);

In [None]:
# Length of the entry stored for the word "us" in `rr.embedding`.
rr.embedding["us"] |> length

In [None]:
# (precision, recall, f1) of `supervised_wsd` with `rr`, overall and per POS.
# NOTE(review): `only_of_pos` is not defined in the visible notebook, so the four
# per-POS lines fail unless it is provided elsewhere.
println("overall: \t\t", evalute_on_wsd_challenge(challenges, solutions, rr, db))

println("only nouns  : \t\t", evalute_on_wsd_challenge(only_of_pos(challenges,'n'), only_of_pos(solutions,'n'), rr, db))
println("only verbs  : \t\t", evalute_on_wsd_challenge(only_of_pos(challenges,'v'), only_of_pos(solutions,'v'), rr, db))
println("only adjecti: \t\t", evalute_on_wsd_challenge(only_of_pos(challenges,'a'), only_of_pos(solutions,'a'), rr, db))
println("only adverbs: \t\t", evalute_on_wsd_challenge(only_of_pos(challenges,'r'), only_of_pos(solutions,'r'), rr, db))

In [None]:
# `hh` is a copy of `rr` in which each word's entry is extended with `ee`'s entry
# for the same word.
hh = deepcopy(rr)
foreach(word -> append!(hh.embedding[word], ee.embedding[word]), keys(rr.embedding))

In [None]:
# (precision, recall, f1) of `supervised_wsd` with `hh`, overall and per POS.
# NOTE(review): `only_of_pos` is not defined in the visible notebook, so the four
# per-POS lines fail unless it is provided elsewhere.
println("overall: \t\t", evalute_on_wsd_challenge(challenges, solutions, hh, db))

println("only nouns  : \t\t", evalute_on_wsd_challenge(only_of_pos(challenges,'n'), only_of_pos(solutions,'n'), hh, db))
println("only verbs  : \t\t", evalute_on_wsd_challenge(only_of_pos(challenges,'v'), only_of_pos(solutions,'v'), hh, db))
println("only adjecti: \t\t", evalute_on_wsd_challenge(only_of_pos(challenges,'a'), only_of_pos(solutions,'a'), hh, db))
println("only adverbs: \t\t", evalute_on_wsd_challenge(only_of_pos(challenges,'r'), only_of_pos(solutions,'r'), hh, db))

In [10]:
# Re-load the CorpusLoaders package so edits to it are picked up in this session.
reload("CorpusLoaders")

In [5]:
# Corpus used to learn the sense alignment maps: the brown1 and brown2 parts of
# SemCor 2.1, collected into one vector.
# The `10` is passed straight to the loader; presumably a window size — TODO confirm.
mapping_corpus = CorpusLoaders.lazyload_semcor("data/corpora/semcor2.1/brown1/tagfiles/", 10) |> collect;
append!(mapping_corpus, CorpusLoaders.lazyload_semcor("data/corpora/semcor2.1/brown2/tagfiles/", 10)|> collect);

In [14]:
# A word with only a POS tag has no sense annotation, so there is nothing to identify.
identify(::PosTaggedWord, wn::DB, ee) = throw(KeyError("No sense in a PosTaggedWord"))

"""
    identify(taggedword::SenseAnnotatedWord, wn::DB, ee)

Locate the annotated sense of `taggedword` among the WordNet sense keys of its lemma
and fetch the word's sense vectors and priors from `ee`.

Returns `(target_wnsn, nsenses, pos, wvs, priors)`: the index of the annotated sense
key, the number of sense keys, the WordNet POS, and the vectors and priors from `ee`.
Throws `KeyError` when the annotated key is not among the WordNet keys, or when `ee`
has no vectors for the word.
"""
function identify(taggedword::SenseAnnotatedWord, wn::DB,ee)
    pos = pennPOStoWordNetPOS(taggedword.pos)
    lemma = wn[pos, taggedword.lemma]
    wn_sensekeys = sensekeys(wn, lemma)

    target_wnsn = findfirst(wn_sensekeys .== sensekey(taggedword))
    if target_wnsn == 0
        throw(KeyError("$(sensekey(taggedword)) not in  $(wn_sensekeys).\n\nIssue is with $taggedword"))
    end

    wvs = all_word_sense_vectors(ee,taggedword.word, taggedword.lemma)
    priors = all_word_sense_priors(ee,taggedword.word, taggedword.lemma)
    if length(wvs) == 0
        throw(KeyError("No embedding for $(taggedword.word); skipping"))
    end

    return target_wnsn, length(wn_sensekeys), pos, wvs, priors
end

identify (generic function with 2 methods)

In [15]:
"""
    agirresAlignment(ee, wn::DB, mapping_corpus, hard=true)

Learn, for each `(word, lemma, pos)` seen with a sense annotation in `mapping_corpus`,
a matrix relating the senses of `ee` (rows) to the WordNet senses (columns).

For every sense-annotated word, `general_wsd` scores the senses of `ee` on the
lower-cased sentence. With `hard` the best-scoring row of the annotated column is
incremented by one; otherwise the whole score vector is added to that column.
Each column is then divided by the number of times its WordNet sense was seen;
columns never seen are set to zero.

Returns `(maps, freqs)`: the normalised matrices and the per-sense occurrence counts.
Words for which `identify` throws `KeyError` are skipped; other errors propagate.
"""
function agirresAlignment(ee, wn::DB, mapping_corpus, hard=true)
    maps = Dict{Tuple{String, String, Char}, Matrix{Float32}}()
    freqs = Dict{Tuple{String, String, Char}, Vector{Int}}()

    # Words without a sense annotation contribute nothing.
    function proc_word(word::PosTaggedWord, sentence)
    end

    function proc_word(taggedword::SenseAnnotatedWord, sentence)
        local target_wnsn, nsenses, pos, wvs, priors
        try
            target_wnsn, nsenses, pos, wvs, priors = identify(taggedword, wn, ee)
        catch err
            # Rethrow the caught exception itself; `ee` is the embedding model.
            isa(err, KeyError) || rethrow(err)
            return
        end

        key = (taggedword.word, taggedword.lemma, pos)
        counts = get!(maps, key) do
            zeros(Float32, length(wvs), nsenses)
        end
        freq = get!(freqs, key) do
            zeros(Int, nsenses)
        end

        context = lowercase.(strip_tags(sentence))
        wv_probs = general_wsd(ee,context, wvs, priors)
        @assert(length(wv_probs) == length(wvs))
        @assert sum(wv_probs) ≈ 1f0
        @assert !any(isnan.(wv_probs))
        freq[target_wnsn] += 1
        if hard
            counts[indmax(wv_probs),target_wnsn] += 1
        else
            @assert(length(counts[:,target_wnsn]) == length(wv_probs),
            "$(length(counts[:,target_wnsn])) != $(length(wv_probs)) for \"$(taggedword.lemma)\""
            )
            counts[:,target_wnsn] += wv_probs
        end
    end

    @showprogress for (word, sentence) in mapping_corpus
        proc_word(word, sentence)
    end

    @showprogress for ((word, lem, pos), freq) in freqs
        mm = maps[(word, lem, pos)]
        mm ./= freq'
        mm[isnan.(mm)] = 0f0 #NaNs are just frequency 0 items
        # Store the result explicitly so it reaches `maps` whether or not `./=`
        # updated the stored matrix in place.
        maps[(word, lem, pos)] = mm
        col_sums = sum(mm, 1)
        @assert(all(s -> isapprox(s, 1f0; atol=1f-5) || isapprox(s, 0f0; atol=1f-5), col_sums),
                    "($word , $pos) not sum to one, $(col_sums)")
    end

    maps, freqs
end

agirresAlignment (generic function with 2 methods)

In [74]:
# Back-off that never answers: always an empty Nullable, i.e. "not attempted".
function nobackoff(challenge, ee, wn)
    return Nullable{String}()
end
# Back-off to the most frequent sense; the model `ee` is ignored.
function MSF_backoff(challenge, ee, wn)
    return most_frequent_sense(challenge, wn)
end
# Back-off to `supervised_wsd` with every normalisation and prior option switched on
# and no further MFS back-off.
function refitting_backoff(challenge, ee, wn)
    return supervised_wsd(challenge, ee, wn::DB;
        normalise_over_context_length = true,
        normalize_over_prior = true,
        use_prior_for_alignment = true,
        MFS_backoff = false)
end

"""
    mapped_wsd(challenge, ee, wn::DB, maps; backoff=nobackoff)

Pick a WordNet sense for `challenge` by projecting the sense scores of `ee` through
the alignment matrix stored in `maps` for `(word, lemma, pos)`, then combining the
result with add-one smoothed WordNet sense counts from `wn`.

Returns a `Nullable` holding the chosen sense key. When any step throws `KeyError`
(no lemma, no embeddings, no map for the word) the result of
`backoff(challenge, ee, wn)` is returned instead; other errors are rethrown.
"""
function mapped_wsd(challenge, ee, wn::DB, maps::Associative{Tuple{String, String, Char}, Matrix{Float32}};
                    backoff = nobackoff)
    l_smoothing = 1
    l_c_smoothing = 1
    try
        lem = wn[challenge.lemma, challenge.pos]
        target_synsets = synsets(wn, lem)

        wvs = all_word_sense_vectors(ee, challenge.word)
        length(wvs) > 0 || throw(KeyError("No embeddings for "*challenge.word))

        # Smoothed prior over WordNet senses, read from `wn` rather than the global `db`.
        p_l = Float32[sensecount(wn, l_i, lem) for l_i in target_synsets]
        p_l .+= l_smoothing
        p_l./=sum(p_l)

        # NOTE(review): the context is not lower-cased here, unlike in `supervised_wsd`
        # and `agirresAlignment` — confirm whether that is intended.
        p_u_c = general_wsd(ee,challenge.context, wvs)

        pseudo_p_l_uc = maps[challenge.word, challenge.lemma, challenge.pos]
        p_l_c = vec(p_u_c'*pseudo_p_l_uc)
        # Shift so the smallest entry is non-negative, smooth, then renormalise.
        p_l_c.+= -1*min(minimum(p_l_c),0.0) + l_c_smoothing
        p_l_c./=sum(p_l_c)
        @assert(all(p_l_c.>=0), p_l_c)

        # Element-wise geometric mean of the mapped scores and the prior.
        probs_of_sense = sqrt.(p_l_c.*p_l)

        synset = target_synsets[indmax(probs_of_sense)]
        return Nullable(sensekey(wn, synset, lem))
    catch ex
        isa(ex, KeyError) || rethrow(ex)
        return backoff(challenge,ee,wn)
    end
end

"Evaluate `mapped_wsd` with model `ee`, database `wn` and alignment `maps`, using `backoff`."
function evalute_on_wsd_challenge(challenges, solutions, ee, wn::DB, maps; backoff=nobackoff)
    return evalute_on_wsd_challenge(challenges, solutions,
        challenge -> mapped_wsd(challenge, ee, wn, maps; backoff=backoff))
end

evalute_on_wsd_challenge (generic function with 3 methods)

In [48]:
# Learn the hard-assignment alignment maps (and sense frequencies) for `am`.
maps_hard_am, freqs  = agirresAlignment(am, db, mapping_corpus, true);

Progress:   0%|                                         |  ETA: 0:34:27

LoadError: LoadError: InterruptException:
while loading In[48], in expression starting on line 1

In [75]:
# Evaluate `mapped_wsd` with `am` and the hard maps, backing off to `refitting_backoff`.
# The commented lines are previously recorded (precision, recall, f1) results.
println("overall: \t\t", evalute_on_wsd_challenge(
challenges, solutions, am, db, maps_hard_am;
backoff = refitting_backoff
))

#maps_soft(am) overall: 		(0.6897311591009255,0.6897311591009255,0.6897311591009255
#maps_hard(am) overall: 		(0.7814014984574702,0.7814014984574702,0.7814014984574703)

Progress:  20%|████████                                 |  ETA: 0:00:26ex = KeyError(" SubString{String}[\"noncompetitively\"], nor SubString{String}[\"noncompetitively\"] have embeddings")
Progress:  25%|██████████                               |  ETA: 0:00:24ex = KeyError(" SubString{String}[\"semiliterate\"], nor SubString{String}[\"semiliterate\"] have embeddings")
Progress: 100%|█████████████████████████████████████████| Time: 0:00:34
notattempted = 2
overall: 		(0.78429642699603,0.7836051123843103,0.7839506172839507)


In [77]:
# Scratch check: the rational 95//100 times 100 converts exactly to the integer 95.
Int(95//100 * 100)

95

In [37]:
# Fraction of challenges remaining after subtracting 356.
# Presumably 356 is a `notattempted` count from an earlier run — confirm.
(length(challenges) - 356)/length(challenges)

0.8431026884089907

In [None]:
"""
    matrix_from_rows(Xs::Vector{Vector{T}})

Stack the vectors in `Xs` as the rows of a `Matrix{T}`. The column count is the
length of the first vector.
"""
function matrix_from_rows{T}(Xs::Vector{Vector{T}})
    nrows, ncols = length(Xs), length(first(Xs))
    out = Matrix{T}(nrows, ncols)
    for r in 1:nrows
        @inbounds out[r, :] = Xs[r]
    end
    out
end

"""
    bestfact!(x)

Factorise `x` in place: SVD when it has fewer rows than columns, otherwise pivoted QR.
"""
function bestfact!(x)
    nrows, ncols = size(x, 1), size(x, 2)
    # A square-matrix LU branch (lufact!) was considered and left disabled.
    return nrows < ncols ? svdfact!(x) : qrfact!(x, Val{true})
end

"""
    leastSquaresAlignment(ee, wn::DB, mapping_corpus, hard=true)

Learn, for each `(word, lemma, pos)` seen with a sense annotation in `mapping_corpus`,
a least-squares map from the sense scores of `ee` to a one-hot WordNet sense indicator.

For every sense-annotated word the `general_wsd` score vector becomes a row of `X`
and the one-hot vector of the annotated sense a row of `Y`. The map is the solution
of `X \\ Y` through `bestfact!`.

Returns the dictionary of maps. Words for which `identify` throws `KeyError` are
skipped; other errors propagate. `hard` is accepted but not used.
"""
function leastSquaresAlignment(ee, wn::DB, mapping_corpus, hard=true)
    vars = Dict{Tuple{String, String, Char}, Tuple{Vector{Vector{Float32}}, Vector{Vector{Float32}}}}()

    # Words without a sense annotation contribute nothing.
    function proc_word(word::TaggedWord, sentence::TaggedSentence)
    end

    function proc_word(taggedword::SenseAnnotatedWord, sentence::TaggedSentence)
        local target_wnsn, nsenses, pos, wvs
        try
            target_wnsn, nsenses, pos, wvs = identify(taggedword, wn, ee)
        catch err
            # Rethrow the caught exception itself; `ee` is the embedding model.
            isa(err, KeyError) || rethrow(err)
            return
        end

        Xs, Ys = get!(vars, (taggedword.word, taggedword.lemma, pos)) do
            Vector{Vector{Float32}}(), Vector{Vector{Float32}}()
        end

        context = lowercase.(strip_tags(sentence))
        wv_probs = general_wsd(ee,context, wvs)
        @assert(length(wv_probs) == length(wvs))
        @assert sum(wv_probs) ≈ 1f0
        @assert !any(isnan.(wv_probs))
        push!(Xs, wv_probs)
        # One-hot indicator of the annotated WordNet sense.
        y = zeros(Float32, nsenses)
        y[target_wnsn] = 1f0
        push!(Ys, y)
    end

    @showprogress for sentence in mapping_corpus
        for word in sentence
            proc_word(word, sentence)
        end
    end

    maps = Dict{Tuple{String, String, Char}, Matrix{Float32}}()

    @showprogress for (key, (Xs,Ys)) in vars
        X = matrix_from_rows(Xs)
        Y = matrix_from_rows(Ys)
        try
            maps[key] = bestfact!(X)\Y
        catch err
            # Dump the offending system before propagating the failure.
            @show X
            println("-"^30)
            @show Y
            rethrow(err)
        end
    end

    maps
end

In [None]:
# Learn the least-squares alignment maps for `am`.
maps_ls_am = leastSquaresAlignment(am, db, mapping_corpus);

In [None]:
# Evaluate `mapped_wsd` with `am` and the least-squares maps (default `nobackoff`).
# The commented line is a previously recorded (precision, recall, f1) result.
println("overall: \t\t", evalute_on_wsd_challenge(challenges, solutions, am, db, maps_ls_am))
#am, db, maps_ls_am overall: 		(0.753195240193918,0.753195240193918,0.7531952401939179)

In [None]:
# Scratch example sentence, split on whitespace.
# NOTE(review): `civil` is an undefined variable here; presumably the string "civil"
# was intended — confirm.
split("it was a brutal war by all accounts")
word = civil 

In [None]:
# Scratch probe: score an example context against the senses of `key[1]` in `am`,
# project the scores through `mm`, normalise, and weight by `p_l`.
# NOTE(review): `key`, `mm` and `p_l` are assigned in the following cell, so that
# cell has to be run first.
#maps = deepcopy(maps_ls_am);


p_u_c = general_wsd(am, split("our regular lunchtime was bread and cheese"), all_word_sense_vectors(am,key[1]))
v = mm'*p_u_c
v./=sum(v)
v.*= p_l
@show v

In [None]:

# Scratch probe of one alignment map: take the entry after the first 500 of `maps`,
# add one to every cell of a copy, and show its key, size and the matching synsets.
# NOTE(review): the only visible assignment to a global `maps` is commented out in the
# preceding cell — confirm where `maps` comes from.
key, mm = drop(maps,500) |> first |> deepcopy 
mm+=1
@show key
@show size(mm)
ss = synsets(db, db[key[2:end]...])
display(ss)

# Add-one smoothed, normalised WordNet sense counts for the same lemma.
p_l = Float32[sensecount(db, ss_i, db[key[2:end]...]) for ss_i in ss]
p_l +=1
p_l./=sum(p_l)
display(p_l)

# Project the scores for an example context through the map and weight by the prior.
p_u_c = general_wsd(am, split("it was a very expensive for the taxi"), all_word_sense_vectors(am,key[1]))
v = p_u_c'*mm
v./=sum(v)
vec(v).*p_l


In [None]:
"""
    hybrid_wsd(challenge, ee, wn::DB, maps)

Placeholder for a hybrid disambiguation method; not implemented yet.

Returns an empty `Nullable{String}`, so `evalute_on_wsd_challenge` counts every
challenge as not attempted. The bare `nothing` of an empty body does not fit the
evaluator's `isnull`/`get` protocol.
"""
function hybrid_wsd(challenge, ee, wn::DB, maps::Associative{Tuple{String, String, Char}, Matrix{Float32}})
    # TODO: implement the hybrid method.
    return Nullable{String}()
end

"Evaluate `hybrid_wsd` with model `ee`, database `wn` and alignment `maps`."
function evalute_on_wsd_challenge_hybrid(challenges, solutions, ee, wn::DB, maps)
    return evalute_on_wsd_challenge(challenges, solutions,
        challenge -> hybrid_wsd(challenge, ee, wn, maps))
end

In [None]:
# Evaluate `hybrid_wsd` with `am` and the least-squares maps.
# NOTE(review): the recorded result below is identical to the one noted for the plain
# least-squares evaluation; presumably copied from that cell — confirm.
println("overall: \t\t", evalute_on_wsd_challenge_hybrid(challenges, solutions, am, db, maps_ls_am))
#am, db, maps_ls_am overall: 		(0.753195240193918,0.753195240193918,0.7531952401939179)