In [2]:
using WordNet
using WordEmbeddings, SoftmaxClassifier
using Utils
using Query
using Distances
using Iterators
using JLD
using Trees
using AbstractTrees
using BlossomV

In [19]:
# Load AdaGram on every process that exists when this cell runs.
# NOTE(review): this cell appears above the `addprocs` cell in the file but has a
# later execution count; if the notebook is run top to bottom, confirm that workers
# added afterwards also get AdaGram loaded.
@everywhere  using AdaGram


In [3]:
# Load the object stored under the key "ee" from the JLD file into the global `ee`.
# Later cells read `ee.distribution`, `ee.corpus_size`, `ee.embedding` and
# `ee.classification_tree` from it.
ee = load("../eval/models/plain/tokenised_lowercase_WestburyLab.wikicorp.201004_50__i1.jld","ee");


In [4]:
# Load the object stored under the key "semtree" from semtree.jld (path relative to
# the working directory). It is passed to `semhuff_initialize_AdaGram` below, which
# requires a `Trees.BranchNode`.
semtree = JLD.load("semtree.jld", "semtree");

In [20]:
"""
    semhuff_initialize_AdaGram(semtree, dim, num_meanings; alpha=0.01, d=0.0, source=ee)

Build an `AdaGram.VectorModel` and matching `AdaGram.Dictionary` whose hierarchical
outputs are taken from `semtree` rather than from a frequency-based Huffman tree.

# Arguments
- `semtree::Trees.BranchNode`: tree whose leaves carry the vocabulary words.
- `dim::Integer`: passed to `AdaGram.VectorModel` as the representation dimension.
- `num_meanings::Integer`: passed to `AdaGram.VectorModel` as the number of prototypes.

# Keywords
- `alpha`, `d`: forwarded unchanged to `AdaGram.VectorModel`.
- `source`: object providing `distribution` (indexable by word) and `corpus_size`,
  used to estimate per-word counts. Defaults to the notebook global `ee`, which keeps
  existing call sites working while making the dependency explicit.

# Returns
The tuple `(vm, dict)`.
"""
function semhuff_initialize_AdaGram(semtree::Trees.BranchNode, dim::Integer, num_meanings::Integer;
                                    alpha::Float64=0.01, d::Float64=0.0, source=ee)
    paths = Dict(node.data => path for (node, path) in Trees.get_paths(semtree))

    # The codes from `leaves_of` are shifted down by one before being stored as Int8
    # (presumably converting 1-based branch indices to 0-based codes -- TODO confirm).
    codes = Dict(w => convert(Vector{Int8}, oc - 1) for (w, oc) in leaves_of(semtree))
    dict = AdaGram.Dictionary(convert(Vector{AbstractString}, collect(keys(paths))))

    # Size the per-word arrays from the dictionary that supplies the indices below.
    nwords = length(dict.id2word)
    freqs = Vector{Int64}(nwords)
    huffman_outputs = Vector{AdaGram.HierarchicalOutput}(nwords)
    for word in dict.id2word
        id = dict.word2id[word]
        # TODO: the math on this is not quite right, because subsampling could have
        # changed the effective corpus size.
        freqs[id] = round(Int64, source.distribution[word] * source.corpus_size)

        huffman_outputs[id] = AdaGram.HierarchicalOutput(codes[word], paths[word])
    end

    vm = AdaGram.VectorModel(freqs, dim, num_meanings, alpha, d, huffman_outputs)
    return vm, dict
end



semhuff_initialize_AdaGram (generic function with 1 method)

In [15]:
# Start 11 additional worker processes for parallel training.
addprocs(11)
# Load AdaGram on every process, including the workers just created, so that a
# top-to-bottom run of the notebook does not leave the new workers without it.
@everywhere using AdaGram

11-element Array{Int64,1}:
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12

In [None]:
# File locations and hyper-parameters for the "semhuff_v1" AdaGram training run.
base_name = "semhuff_v1"
param_save_fn =  "../eval/models/adagram/$(base_name).params.jld"
output_fn = "../eval/models/adagram/$(base_name).adagram_model" # file to save the model (in Julia format)

# Refuse to continue if a model already exists at the output path, and say which
# file is in the way instead of failing with a bare assertion.
isfile(output_fn) && error("Refusing to overwrite existing model file: $(output_fn)")

# NOTE(review): `@param_save` is defined outside this file; later cells rely on the
# names assigned in this block (e.g. `train_fn`, `window`) being visible afterwards,
# and it presumably also records them in `param_save_fn` -- confirm.
@param_save param_save_fn begin
    nprocessors = nprocs()
    train_fn  =  "../eval/data/corpora/WikiCorp/tokenised_lowercase_WestburyLab.wikicorp.201004.txt" # training text data
    output_fn = output_fn # file to save the model (in Julia format)
    semsimsource_fn = "../eval/models/plain/tokenised_lowercase_WestburyLab.wikicorp.201004_50__i1.jld"

    window = 10 # (max) window size; C in the paper
    min_freq  = 20 # min. frequency of the word
    remove_top_k = 0 # remove top K most frequent words
    dim  = 300 # dimensionality of representations
    prototypes = 5 # number of word prototypes; T in the paper
    alpha = 0.15 # prior probability of allocating a new prototype
    d  = 0.0 # parameter of Pitman-Yor process; D in the paper
    subsample = 1e-5 # subsampling threshold; a useful value is 1e-5
    context_cut  = true # randomly reduce size of the context
    epochs = 1 # number of epochs to train
    initcount = 1. # initial weight (count) on first sense for each word
    stopwords = Set{AbstractString}() # list of stop words
    sense_treshold = 1e-10 # minimal probability of a meaning to contribute to gradients
    save_treshold = 0.0 # minimal probability of a meaning to save after training
end

In [22]:
# Build the model from the recorded hyper-parameters (`dim`, `prototypes`, `alpha`,
# `d`) rather than repeating their literal values, so the trained model cannot drift
# from what the parameter block records.
vm, dict = semhuff_initialize_AdaGram(semtree, dim, prototypes; alpha=alpha, d=d);

In [None]:
# Train `vm` in place on the corpus at `train_fn`, using the settings from the
# parameter block above.
inplace_train_vectors!(
    vm, dict, train_fn, window;
    threshold=subsample,
    context_cut=context_cut,
    epochs=epochs,
    init_count=initcount,
    sense_treshold=sense_treshold
)

	From worker 2:	64000 words read, 1443333/541936072
	From worker 3:	64000 words read, 543495711/1083872144
	From worker 4:	64000 words read, 1085336468/1625808216
	From worker 5:	64000 words read, 1627306849/2167744288
	From worker 6:	64000 words read, 2169224390/2709680360
	From worker 7:	64000 words read, 2711151461/3251616432
	From worker 8:	64000 words read, 3253113250/3793552504
	From worker 9:	64000 words read, 3795047471/4335488576
	From worker 10:	64000 words read, 4337017677/4877424648
	From worker 11:	64000 words read, 4878935567/5419360720
	From worker 12:	64000 words read, 5420892477/5961296792
	From worker 2:	0.22% -10.5548 0.0249 0.0249 2.06/4.00 0.55 kwords/sec
	From worker 5:	0.22% -10.5549 0.0249 0.0249 2.13/4.00 0.56 kwords/sec
	From worker 4:	0.23% -10.5568 0.0249 0.0249 2.08/4.00 0.55 kwords/sec
	From worker 3:	0.23% -10.5564 0.0249 0.0249 2.07/4.00 0.53 kwords/sec
	From worker 6:	0.23% -10.5578 0.0249 0.0249 2.03/4.00 0.55 kwords/sec
	From worker 7:	0.23% -10.5581 

In [None]:
import WordEmbeddings: NetworkType

# Network type marker for the SemHuff variant.
# `source` holds an existing embedding model; `initialize_network!` for this type
# reads `source.classification_tree` and `source.embedding` from it.
type SemHuff <: NetworkType
    source::GenWordEmbedding
end

In [None]:
using Training
using PooledElements

In [None]:
"""
    initialize_network!(embed::GenWordEmbedding, network_type::SemHuff)

Set up the classification tree and codebook of `embed` from a SemHuff ordering of
the source model held in `network_type.source`.

The source model's classification tree and embeddings are passed to `semhuff`; the
resulting tree is rebuilt with words kept at the leaves and a fresh
`LinearClassifier(2, embed.dim)` at every internal node. `embed.codebook` is then
built from the leaves of that new tree.

Mutates and returns `embed`.
"""
function initialize_network!(embed::GenWordEmbedding, network_type::SemHuff)
    source_tree = network_type.source.classification_tree
    source_embeddings = network_type.source.embedding

    debug("Began SemHuff sorting")
    # NOTE(review): the meaning of the literal 30 is defined by `semhuff`, which is
    # not visible here -- confirm before changing it.
    sorted_tree = semhuff(source_tree, source_embeddings, 30)
    debug("Completed SemHuff sorting")

    debug("Began classification tree creation")
    embed.classification_tree = transform_tree(sorted_tree,
        leaf_transform = word -> word,
        internal_transform = dummy -> LinearClassifier(2, embed.dim))

    # Build the codebook from the tree just stored on `embed`; the previous code
    # referred to an undefined local `classification_tree` here.
    embed.codebook = Dict(leaves_of(embed.classification_tree))
    debug("Completed SemHuff Bootstrapping")
    return embed
end

In [None]:
# Create a SemHuff variant of the loaded model `ee`.
# Start from an independent deep copy so that field assignments do not touch `ee`.
sem_ee = deepcopy(ee)
# Use the original model as the source for the SemHuff ordering.
sem_ee.network_type = SemHuff(ee)
# Re-key the embedding with `pstring(word)`.
# NOTE(review): the values `wv` are taken from `ee.embedding`, not from the deep copy,
# so they are shared with `ee` unless `initialize_embedding` replaces them -- confirm.
sem_ee.embedding = Dict(pstring(word)=>wv for (word, wv) in ee.embedding)
initialize_embedding(sem_ee,sem_ee.init_type)
# Build the SemHuff classification tree and codebook on the copy.
initialize_network!(sem_ee,sem_ee.network_type)