In [2]:
using WordNet
using WordEmbeddings, SoftmaxClassifier
using Utils
using Query
using Distances
using Iterators
using JLD
using Trees
using AbstractTrees
using BlossomV

In [19]:
# Load AdaGram on the master and on every worker process.
# NOTE(review): `@everywhere` only reaches processes that exist when it runs.
# The execution counters (this cell is In [19], `addprocs(11)` is In [15]) show it
# was run AFTER the workers were added; keep that order when re-running.
@everywhere  using AdaGram


In [3]:
# Read the object stored under the key "ee" from the JLD file.
# It is used further down as the source embedding that `sem_ee` is copied from.
ee = load("../eval/models/plain/tokenised_lowercase_WestburyLab.wikicorp.201004_50__i1.jld","ee");


In [4]:
# Read the object stored under the key "semtree" from semtree.jld
# (a later cell reports its type as Trees.BranchNode).
semtree = JLD.load("semtree.jld", "semtree");

In [37]:
# Display the `subsampling` field of the loaded embedding.
ee.subsampling

1.0f-5

In [15]:
# Start 11 worker processes; the returned array lists the new worker ids.
# Packages needed on the workers must be loaded with `@everywhere` after this call.
addprocs(11)

11-element Array{Int64,1}:
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12

In [38]:
# Load every entry of a previously saved parameter file as a Dict, for reference.
load("../eval/models/adagram/more_senses.params.jld")

Dict{String,Any} with 18 entries:
  "prototypes" => 30
  "nprocessors" => 13
  "output_fn" => "../models/adagram/more_senses.adagram_model"
  "sense_treshold" => 1.0e-10
  "remove_top_k" => 0
  "context_cut" => true
  "initcount" => 1.0
  "train_fn" => "../data/corpora/WikiCorp/tokenised_lowercase_WestburyLab.wikicorp.201004.txt"
  "d" => 0.0
  "alpha" => 0.25
  "subsample" => 1.0e-5
  "epochs" => 1
  "window" => 10
  "min_freq" => 20
  "save_treshold" => 0.0
  "dim" => 100
  "stopwords" => Set{AbstractString}()
  "dict_fn" => "../data/corpora/WikiCorp/tokenised_lowercase_WestburyLab.wikicorp.201004.1gram"

In [None]:
# Display the `subsampling` field of the loaded embedding
# (compare with the `subsample` value chosen in the parameter cell below).
ee.subsampling

In [None]:
# Experiment configuration: names the run, refuses to overwrite an existing model,
# and hands the hyper-parameters to `@param_save` together with `param_save_fn`.
base_name = "semhuff_more_senses"
param_save_fn = "../eval/models/adagram/$(base_name).params.jld"
output_fn = "../eval/models/adagram/$(base_name).adagram_model"  # file to save the model (in Julia format)

# Refuse to clobber a previous run. An explicit check is used instead of `@assert`,
# which is meant for internal invariants and may be disabled. The model is written
# further down to `output_fn * ".jld"`, so that path is guarded as well.
for existing_fn in (output_fn, output_fn * ".jld")
    isfile(existing_fn) && error("refusing to overwrite existing model file: $(existing_fn)")
end

@param_save param_save_fn begin
    nprocessors = nprocs()
    train_fn = "../eval/data/corpora/WikiCorp/tokenised_lowercase_WestburyLab.wikicorp.201004.txt"  # training text data
    output_fn = output_fn  # file to save the model (in Julia format)
    semsimsource_fn = "../eval/models/plain/tokenised_lowercase_WestburyLab.wikicorp.201004_50__i1.jld"

    window = 10  # (max) window size; C in the paper
    min_freq = 20  # min. frequency of the word
    remove_top_k = 0  # remove top K most frequent words
    dim = 100  # dimensionality of representations
    prototypes = 30  # number of word prototypes; T in the paper
    alpha = 0.25  # prior probability of allocating a new prototype
    d = 0.0  # parameter of Pitman-Yor process; D in the paper
    subsample = 1e-5  # subsampling threshold; a useful value is 1e-5
    context_cut = true  # randomly reduce size of the context
    epochs = 1  # number of epochs to train
    initcount = 1.0  # initial weight (count) on first sense for each word
    stopwords = Set{AbstractString}()  # list of stop words
    # The identifiers below keep the "treshold" spelling: they are used as-is as
    # keyword names / saved keys elsewhere in this notebook.
    sense_treshold = 1e-10  # minimal probability of a meaning to contribute to gradients
    save_treshold = 0.0  # minimal probability of a meaning to save after training
end

In [22]:
# Build the AdaGram vector model and dictionary on top of the semantic tree.
# NOTE(review): the literals here (300, 5, alpha=0.15) do not match the values
# recorded in the parameter cell (dim = 100, prototypes = 30, alpha = 0.25).
# Presumably the two positional numbers are dimensionality and prototype count
# (TODO confirm against semhuff_initialize_AdaGram), in which case the saved
# params file does not describe the model that is actually trained.
vm, dict = semhuff_initialize_AdaGram(semtree, 300, 5; alpha=0.15, d=0.0);

In [None]:
# Train `vm` in place on the corpus at `train_fn`, using the settings from the
# parameter cell (keyword names are those expected by `inplace_train_vectors!`).
inplace_train_vectors!(
    vm, dict, train_fn, window;
    epochs=epochs,
    threshold=subsample,
    context_cut=context_cut,
    init_count=initcount,
    sense_treshold=sense_treshold
)

	From worker 2:	64000 words read, 1443333/541936072
	From worker 3:	64000 words read, 543495711/1083872144
	From worker 4:	64000 words read, 1085336468/1625808216
	From worker 5:	64000 words read, 1627306849/2167744288
	From worker 6:	64000 words read, 2169224390/2709680360
	From worker 7:	64000 words read, 2711151461/3251616432
	From worker 8:	64000 words read, 3253113250/3793552504
	From worker 9:	64000 words read, 3795047471/4335488576
	From worker 10:	64000 words read, 4337017677/4877424648
	From worker 11:	64000 words read, 4878935567/5419360720
	From worker 12:	64000 words read, 5420892477/5961296792
	From worker 2:	0.22% -10.5548 0.0249 0.0249 2.06/4.00 0.55 kwords/sec
	From worker 5:	0.22% -10.5549 0.0249 0.0249 2.13/4.00 0.56 kwords/sec
	From worker 4:	0.23% -10.5568 0.0249 0.0249 2.08/4.00 0.55 kwords/sec
	From worker 3:	0.23% -10.5564 0.0249 0.0249 2.07/4.00 0.53 kwords/sec
	From worker 6:	0.23% -10.5578 0.0249 0.0249 2.03/4.00 0.55 kwords/sec
	From worker 7:	0.23% -10.5581 

In [34]:
# For the first five senses of `word`, print the sense's value from `expected_pi`
# followed by its ten nearest neighbours, one blank line between senses.
word = "fried"
prior_probs = expected_pi(vm, dict.word2id[word])
for sense in 1:5
    sense_prob = prior_probs[sense]
    neighbours = nearest_neighbors(vm, dict, word, sense, 10)
    println(sense_prob, "\t", neighbours)
    println()
end

0.4769232204103473	Tuple{AbstractString,Int64,Float32}[("potatoes",1,0.818664),("mashed",1,0.818284),("eaten",1,0.812375),("dishes",1,0.80757),("cooked",1,0.804335),("ingredients",1,0.803254),("spices",1,0.802813),("onions",1,0.801898),("beef",1,0.7977),("beans",2,0.791067)]

0.35301754921391737	Tuple{AbstractString,Int64,Float32}[("dishes",1,0.57407),("vegetable",1,0.56374),("gravy",1,0.561604),("ingredients",1,0.559226),("meat",1,0.55478),("spices",1,0.553243),("potatoes",1,0.552766),("flour",1,0.552601),("soup",1,0.550143),("corn",1,0.547027)]

0.17003412234580592	Tuple{AbstractString,Int64,Float32}[("unfaithful",2,0.54388),("noisy",3,0.531049),("sucks",2,0.522041),("mentality",2,0.504103),("joes",2,0.50138),("jimmy's",2,0.501343),("emancipation",2,0.501151),("specks",2,0.499993),("satanic",2,0.499443),("pastiche",2,0.498762)]

2.1832257134337443e-5	Tuple{AbstractString,Int64,Float32}[("sealed",1,0.227182),("cfm",1,0.221162),("irrigate",1,0.220692),("primer",1,0.220021),("conditioni

In [35]:
using AdaGramCompat

In [36]:
# Bundle the trained vector model with its dictionary and store the bundle
# under the key "am" in "<output_fn>.jld".
am = AdaGramModel(vm, dict)
JLD.save(string(output_fn, ".jld"), "am", am)

In [None]:
import WordEmbeddings: NetworkType

# Network-type tag used to dispatch `initialize_network!` for the semantic-tree
# ("SemHuff") variant.
# NOTE(review): the field is called `semtree` but is typed as a GenWordEmbedding,
# and it is constructed from the embedding `ee` below, while the global `semtree`
# is a Trees.BranchNode. Confirm which of the two this field is meant to hold.
type SemHuffNetwork <: NetworkType
    semtree::GenWordEmbedding
end

# The cells below refer to this type as `SemHuff` (method signature and constructor
# call); without this alias they fail with an UndefVarError.
const SemHuff = SemHuffNetwork

In [None]:
using Training
using PooledElements

In [41]:
# Show the concrete type of the loaded semantic tree.
typeof(semtree)

Trees.BranchNode

In [None]:
"""
    initialize_network!(embed::GenWordEmbedding, network_type::SemHuff)

Set up the output layer of `embed` from the semantic tree and return `embed`.

`embed.classification_tree` becomes a copy of the tree in which every leaf keeps
its word and every internal node is replaced by a `LinearClassifier(2, embed.dim)`.
`embed.codebook` is then built as a `Dict` from `leaves_of` that new tree.

NOTE(review): the tree is read from the global `semtree`; `network_type` is used
for dispatch only. Confirm whether it should come from `network_type` instead.
"""
function initialize_network!(embed::GenWordEmbedding, network_type::SemHuff)
    embed.classification_tree = transform_tree(
        semtree;
        leaf_transform = word -> word,
        internal_transform = dummy -> LinearClassifier(2, embed.dim))

    # The codebook must be derived from the tree just stored on `embed`; the
    # original referenced a bare `classification_tree`, which is not defined here.
    embed.codebook = Dict(leaves_of(embed.classification_tree))
    debug("Completed SemHuff Bootstrapping")
    return embed
end

In [None]:
# Derive a SemHuff embedding from the plain one: copy its settings, swap in the
# SemHuff network type, then re-initialise the vectors and the output layer.
sem_ee = deepcopy(ee)
sem_ee.network_type = SemHuff(ee)
# One fresh, same-shaped (uninitialised) vector per word. The loop must destructure
# into plain names; the original `for (word, similar(wv)) in ...` is not a valid
# destructuring target and left `wv` unbound.
sem_ee.embedding = Dict(pstring(word) => similar(wv) for (word, wv) in ee.embedding)
initialize_embedding(sem_ee, sem_ee.init_type)
initialize_network!(sem_ee, sem_ee.network_type)