In [None]:
using WordNet
using WordEmbeddings, SoftmaxClassifier
using Utils
using Query
using Distances
using Iterators
using NearestNeighbors
using JLD
using Trees
using AbstractTrees
using BlossomV

In [None]:
# Load the object saved under the name "ee" from the JLD file. Later cells read its
# `embedding`, `classification_tree`, `distribution` and `corpus_size` fields.
ee = load("../eval/models/plain/tokenised_lowercase_WestburyLab.wikicorp.201004_50__i1.jld","ee");


In [None]:
using Lumberjack
# Replace the truck registered as "console" with one that writes to STDOUT at "info" level
# with colorized output.
remove_truck("console")
add_truck(LumberjackTruck(STDOUT, "info", Dict{Symbol,Any}(:is_colorized => true)), "console")
# Additionally register a truck that logs to the file semhuff_ipynb.log.
add_truck(LumberjackTruck("semhuff_ipynb.log"), "file-logger")

In [None]:
#run(`tail semhuff_ipynb.log`)

In [None]:
import BlossomV.dense_num_edges
"""
    most_similar_pairings(dist_func, items, consider_nearest_n)

Pair up `items` by solving a perfect matching (BlossomV) over candidate edges weighted
by `dist_func`.

For every item `ii`, only the items after it (`jj > ii`) are candidates, and of those only
the `consider_nearest_n` with the smallest distance are added as edges (all of them when
there are no more than `consider_nearest_n`).

# Arguments
- `dist_func`: called as `dist_func(items[ii], items[jj])`; must return a value between
  0.0 and 1.0 (the upper bound is asserted).
- `items`: indexable collection of the things to pair.
- `consider_nearest_n`: maximum number of candidate partners kept per item.

# Returns
A matrix where each column is a pairing of items, expressed as their indices (1-indexed).
"""
function most_similar_pairings(dist_func::Function, items, consider_nearest_n::Integer)
    n_items = length(items)
    m = Matching(n_items, min(dense_num_edges(n_items), n_items*consider_nearest_n))
    # Distances in [0, 1] are mapped onto integer edge weights; hoisted as it never changes.
    scale = typemax(Int32)>>4
    for ii in 1:n_items-1
        jjs = ii+1:n_items
        dists = [dist_func(items[ii], items[jj]) for jj in jjs]
        # Positions (into both `jjs` and `dists`) of the candidates to keep.
        # Indexing both collections with the same position keeps every edge paired with
        # its own distance; zipping the reordered neighbours against the unordered
        # `dists` would attach the wrong weights.
        kept = if consider_nearest_n < length(jjs)
            selectperm(dists, 1:consider_nearest_n)
        else
            1:length(jjs)
        end

        for kk in kept
            dist = dists[kk]
            @assert dist<=1.0
            approx_dist = round(Int32, dist*scale)
            # BlossomV node ids are 0-based.
            add_edge(m, ii-1, jjs[kk]-1, approx_dist)
        end
    end
    solve(m)
    get_all_matches(m, n_items) .+ 1
end

In [None]:
"""
    transform_tree(node::BranchNode; leaf_transform=identity, internal_transform=identity)

Build a new tree with the same shape as the one rooted at `node`, but with each node's
data replaced: `leaf_transform(data)` for leaves, `internal_transform(data)` for every
other node. The original nodes are not modified.
"""
function transform_tree(node::BranchNode; leaf_transform=identity, internal_transform=identity)
    function rebuild(current, on_leaf, on_internal)
        new_data = if Trees.isleaf(current)
            on_leaf(current.data)
        else
            on_internal(current.data)
        end
        # Rebuild the subtrees in order; a leaf simply ends up with an empty child list.
        rebuilt = BranchNode[]
        for kid in current.children
            push!(rebuilt, rebuild(kid, on_leaf, on_internal))
        end
        BranchNode(copy(rebuilt), new_data)
    end
    rebuild(node, leaf_transform, internal_transform)
end
    

In [None]:
#TODO make this a testcase
# Hand-built example tree: root "1" with a leaf "11" and a branch "12",
# which in turn has the leaves "121" and "122".
x = BranchNode(
[BranchNode([],"11"),BranchNode([
        BranchNode([],"121"),BranchNode([],"122"),
        ],"12")],
    "1"
)
print_tree(STDOUT, x)

println()
# Expect the same shape, with leaf data prefixed by "L" and non-leaf data prefixed by "x".
y=transform_tree(x, leaf_transform = word->"L"*word, internal_transform = dummy -> "x"*dummy)
print_tree(STDOUT, y)

In [None]:
"""
    semhuff(classification_tree, embeddings, consider_nearest_n)

Return a rearranged copy of `classification_tree` in which sibling nodes are semantically
similar.

Assumes that `classification_tree` is already a Huffman tree; every non-leaf node is
asserted to have exactly two children.

Working from the deepest level up to level 2, the nodes on a level are paired with
`most_similar_pairings` using `Query.angular_dist` on their embeddings. Each pair is then
attached as the two children of a non-leaf node on the level above (which pair goes to
which parent is arbitrary). A non-leaf node's embedding is the mean of its two children's
embeddings; a leaf's embedding is looked up in `embeddings` by the leaf's data.

# Arguments
- `classification_tree`: tree whose leaves carry words; it is copied with
  `transform_tree`, and only the copy is rewired.
- `embeddings`: indexable by leaf data, returning that word's embedding.
- `consider_nearest_n`: forwarded to `most_similar_pairings`.

# Returns
The rewired copy; non-leaf nodes carry `""` as data.
"""
function semhuff(classification_tree, embeddings, consider_nearest_n)
    sim_tree = transform_tree(classification_tree,
                            leaf_transform = word->word,
                            internal_transform = dummy -> "")

    # NOTE(review): index 1 of `nodes_at_depth` is presumably the root level -- confirm
    # against `Trees.levels`.
    nodes_at_depth, codes_at_depth = levels(sim_tree)
    maxdepth = length(nodes_at_depth)

    # Every node on the deepest level is a leaf, so each has a word embedding.
    embeds = [embeddings[node.data] for node in nodes_at_depth[maxdepth]]
    for depth in maxdepth:-1:2
        info("semantically sorting level: $depth")
        nodes = nodes_at_depth[depth]

        pair_indexes = most_similar_pairings(Query.angular_dist, embeds, consider_nearest_n)

        #We will now assign the new nodes to parents in arbitrary order
        nodes_above = nodes_at_depth[depth-1]
        embeds_above = typeof(embeds)(length(nodes_above))
        pair_jj = 1

        for (above_ii,node_above) in enumerate(nodes_above)

            if Trees.isleaf(node_above)
                embeds_above[above_ii] = embeddings[node_above.data]
            else
                #It is a branch so put a pair of nodes here
                @assert (length(node_above.children) == 2)
                child_index1  = pair_indexes[1,pair_jj]
                child_index2  = pair_indexes[2,pair_jj]
                pair_jj += 1
                node_above.children[1] = nodes[child_index1]
                node_above.children[2] = nodes[child_index2]
                embeds_above[above_ii] = (embeds[child_index1] + embeds[child_index2])/2.0
            end
        end
        #All pairs must be assigned. The message reports the same quantities the condition compares.
        @assert(pair_jj - 1  == size(pair_indexes,2), "$(pair_jj - 1) pairs assigned != $(size(pair_indexes,2)) pairs produced")

        embeds = embeds_above
    end
    sim_tree
end
    
    

In [25]:
# Take a small subtree by indexing `[1]` ten times from the root
# (presumably each `[1]` selects the first child -- confirm against Trees),
# then time the semantic re-sort with consider_nearest_n = 30.
midi_tree = ee.classification_tree[1][1][1][1][1][1][1][1][1][1]
@time semtree = semhuff(midi_tree, ee.embedding, 30);
#print_tree(STDOUT, semtree)

2016-09-02T00:18:37.769 - info: semantically sorting level: 11
perfect matching with 2 nodes and 1 edges
    starting init...done [0.000 secs]. 0 trees
    .
done [0.000 secs]. 0 grows, 0 expands, 0 shrinks
    expands: [0.000 secs], shrinks: [0.000 secs], dual updates: [0.000 secs]
2016-09-02T00:18:37.773 - info: semantically sorting level: 10
perfect matching with 2 nodes and 1 edges
    starting init...done [0.000 secs]. 0 trees
    .
done [0.000 secs]. 0 grows, 0 expands, 0 shrinks
    expands: [0.000 secs], shrinks: [0.000 secs], dual updates: [0.000 secs]
2016-09-02T00:18:37.775 - info: semantically sorting level: 9
perfect matching with 8 nodes and 28 edges
    starting init...done [0.000 secs]. 0 trees
    .
done [0.000 secs]. 0 grows, 0 expands, 0 shrinks
    expands: [0.000 secs], shrinks: [0.000 secs], dual updates: [0.000 secs]
2016-09-02T00:18:37.779 - info: semantically sorting level: 8
perfect matching with 18 nodes and 153 edges
    starting init...done [0.000 secs]. 2 

Trees.BranchNode with 2 children. data = ""

In [None]:
# Write the printed form of `semtree` to tree.txt; the do-block form of `open`
# closes the file when the block finishes.
open("tree.txt","w") do fp
    print_tree(fp, semtree)
end

In [31]:
using Trees
using Trees.isleaf


In [37]:
"""
    get_paths(root)

Return a task that produces one `(data, path)` item per leaf below `root`, visiting
children in order.

`path` is an `Int32` vector with one entry per non-leaf node passed on the way down.
The entry is the id handed to that node: the root gets 1 and each child gets its
parent's id plus one.
NOTE(review): all nodes on the same level therefore share an id -- confirm this is
what consumers of the paths expect.
"""
function get_paths(root)
    function walk(current, level_id::Int32, trail::Vector{Int32})
        if !isleaf(current)
            next_id = level_id + one(Int32)
            for kid in current.children
                # vcat builds a fresh vector, so every child gets its own copy of the path
                walk(kid, next_id, vcat(trail, level_id))
            end
        else
            produce(current.data, trail)
        end
    end
    @task walk(root, one(Int32), Int32[])
end



get_paths (generic function with 1 method)

In [57]:
# Map each leaf's data to its path, collected from the task returned by get_paths.
npaths = Dict(get_paths(semtree))
# Map each leaf to its code minus one, stored as Int8
# (presumably converting 1-based branch choices to 0-based -- confirm against leaves_of).
ncodes = Dict(w=> convert(Vector{Int8}, oc - 1) for (w, oc) in leaves_of(semtree))
# Build an AdaGram dictionary over the words that have a path.
# NOTE(review): AdaGram is only loaded in a cell further down the file; this cell relies on it having been run first.
ndict = AdaGram.Dictionary(convert(Vector{AbstractString},collect(keys(npaths))))
# Estimated absolute frequency per word, indexed by the dictionary's word id.
freqs = Vector{Int32}(length(ncodes))
for word in ndict.id2word
    freqs[ndict.word2id[word]] = round(Int32,ee.distribution[word] * ee.corpus_size) #Todo: The math on this is not quite right, because subsampling could have messed with the Corpus Size

end

41-element Array{Int32,1}:
  35994
  72005
   4498
   2249
 144010
   2249
    281
   2249
   1125
   1124
    281
   1125
   2250
   1125
   2249
  17995
  71989
   2250
   2250
   2249
   4500
   4498
   1125
   8997
   1125
   2249
   2249
    563
  71967
   2249
   4498
   4498
   1125
   4500
   4500
  72003
   4499
   2250
   4497
   2250
   2249

In [13]:
using AdaGram
# Load a saved AdaGram model; `load_model` returns the model (`vm`) and its dictionary (`dict`).
vm, dict = load_model("../eval/models/adagram/v1_d100.adagram_model");

In [22]:
# Look up the dictionary id of the word "fire" and display its current code column.
wid = dict.word2id["fire"]
@show vm.code[:,wid]

# Overwrite that word's code column with a hand-written 26-entry code
# (the trailing -1 entries are presumably padding -- confirm against AdaGram).
vm.code[:,wid] = Int8[1,1,1,1,1,1,0,0,0,1,1,1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1]


26-element Array{Int8,1}:
  1
  1
  1
  1
  1
  1
  0
  0
  0
  1
  1
  1
 -1
 -1
 -1
 -1
 -1
 -1
 -1
 -1
 -1
 -1
 -1
 -1
 -1
 -1

In [23]:
# Display the path column stored for the same word id.
vm.path[:, wid]

26-element Array{Int32,1}:
 495346
 496321
 496879
 497185
 497349
 497436
 497483
 497508
 497522
 497530
 497534
 497536
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0

552

In [None]:
import WordEmbeddings: NetworkType

"""
Network type that carries a `source` embedding model. `initialize_network!` for this
type reads the source's `classification_tree` and `embedding` to build the new tree.
"""
type SemHuff <: NetworkType
    # Existing model whose tree and embeddings seed the new network.
    source::GenWordEmbedding
end

In [None]:
using Training
using PooledElements

In [None]:
"""
    initialize_network!(embed::GenWordEmbedding, network_type::SemHuff; consider_nearest_n=30)

Set up `embed`'s classification tree and codebook from a semantically sorted version of
the source model's tree.

The source tree and embeddings are taken from `network_type.source` and re-sorted with
`semhuff`. The result is copied into `embed.classification_tree`, keeping the words on
the leaves and placing a fresh `LinearClassifier(2, embed.dim)` on every non-leaf node.
`embed.codebook` is then rebuilt from the leaves of that new tree.

# Keywords
- `consider_nearest_n`: number of nearest candidates considered per node when pairing
  (forwarded to `semhuff`); defaults to 30.

# Returns
The mutated `embed`.
"""
function initialize_network!(embed::GenWordEmbedding, network_type::SemHuff; consider_nearest_n::Integer=30)
    source_tree = network_type.source.classification_tree
    source_embeddings = network_type.source.embedding

    debug("Began SemHuff sorting")
    semtree = semhuff(source_tree, source_embeddings, consider_nearest_n)
    debug("Completed SemHuff sorting")
    debug("Began classification tree creation")
    embed.classification_tree = transform_tree(semtree,
                            leaf_transform = word->word,
                            internal_transform = dummy -> LinearClassifier(2,embed.dim))

    # The codebook must come from the tree just stored on `embed`; a bare
    # `classification_tree` is not a name defined inside this function.
    embed.codebook = Dict(leaves_of(embed.classification_tree))
    debug("Completed SemHuff Bootstrapping")
    embed
end

In [None]:
# Work on a deep copy so that `ee` itself is left untouched.
sem_ee = deepcopy(ee)
# Use the original model as the source for the SemHuff network.
sem_ee.network_type = SemHuff(ee)
# Re-key the embeddings, wrapping each word with `pstring`
# (presumably provided by PooledElements -- confirm).
sem_ee.embedding = Dict(pstring(word)=>wv for (word, wv) in ee.embedding)
# Initialise the embeddings first, then build the semantically sorted tree and codebook.
initialize_embedding(sem_ee,sem_ee.init_type)
initialize_network!(sem_ee,sem_ee.network_type)