In [1]:
using Pipe
push!(LOAD_PATH, "../word-embeddings2")
using WordEmbeddings
we = @pipe load_embeddings("../word-embeddings2/word_emb_data/hlbl-embeddings-scaled.EMBEDDING_SIZE=100.txt") |> WE(_...);



In [2]:
ENV["COLUMNS"] = 300
ENV["LINES"] = 1000
#ENV["PYTHON"] = "python3"
using PyCall

In [5]:
using Iterators
@pyimport nltk.corpus as nltk_corpus
n_training = 1000
training_sents = @pipe nltk_corpus.brown[:sents]() |> filter(s->0<length(s)<=15, _) |> filter(s->s[end]==".", _) |> take(_,n_training)  |> collect |> convert(Vector{Vector{String}},_);


In [6]:
xs = [eval_word_embeddings(we, sent,false)' for sent in training_sents];
#xs = [x[end:-1:1,:] for x in xs]; #Make sentences start at the end (read from full stop back towards start)
println(size(xs))
println(size(xs[1]))

(1000,)
(14,100)


In [7]:
using PyCall
@pyimport numpy
@pyimport hmmlearn.hmm as hl_hmm
@pyimport warnings
warnings.filterwarnings("ignore")


In [8]:
hmm = hl_hmm.GaussianHMM(12,"full")
hmm[:fit](xs)

PyObject GaussianHMM(algorithm='viterbi', covariance_type='full', covars_prior=0.01,
      covars_weight=1,
      init_params='abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ',
      means_prior=0, means_weight=0, n_components=12, n_iter=10,
      params='abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ',
      random_state=None, startprob=None, startprob_prior=1.0, thresh=0.01,
      transmat=None, transmat_prior=1.0)

In [28]:
n_states =12
left_right_tran = [ i<=j ? 1./(n_states-i+1):10.0^-16  for i in 1:n_states, j in 1:n_states]
hmm = hl_hmm.GaussianHMM(n_states,"full", transmat=left_right_tran)
hmm[:fit](xs)

PyObject GaussianHMM(algorithm='viterbi', covariance_type='full', covars_prior=0.01,
      covars_weight=1,
      init_params='abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ',
      means_prior=0, means_weight=0, n_components=12, n_iter=10,
      params='abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ',
      random_state=None, startprob=None, startprob_prior=1.0, thresh=0.01,
      transmat=None, transmat_prior=1.0)

In [36]:
wes, states = hmm[:sample](4)
bests = show_bests(we, wes',1);
@pipe bests[1,1:2:end][:] |> join(_, ' ')

"ran A depot 23"

In [35]:
function get_dotfile(hmm,we, min_start_prob = 0.00, min_trans_prob = 0.0)
    if pyisinstance(hmm,pytypeof(hl_hmm.GMMHMM())) 
        function gmm_cores(gmm)
            gmm_means = @pipe gmm[:means_]' |> round(_,5) |> unique(_,2) # first remove any ~duplicate vectors
            show_bests(we,gmm_means,1)[1,1:2:end] |> unique #then remove any duplicate words 
        end
        node_labels = map(gmm_cores, hmm[:gmms_])
    elseif pyisinstance(hmm,pytypeof(hl_hmm.GaussianHMM())) 
        n_means = 3
        node_labels = show_bests(we, hmm[:means_]',n_means)[1:n_means,1:2:end]
        node_labels= [node_labels[:,ii] for ii in 1:size(node_labels,2)]
    else 
        error("Unknown type of HMM")
    end
    
    prob_from_to = @pipe hmm[:_get_transmat]() |> round(_,2)
    function node_text(index::Int, min_start_prob=0)
        text = @pipe node_labels[index] |> join(_,'/')
        start_prob = @pipe hmm[:startprob_][index] |> round(_,2)
        text*= start_prob>min_start_prob ?  "\nStart Prob: $start_prob":""
        text
    end

    graph_text_buff = IOBuffer()
    println(graph_text_buff,"digraph graphname {")

    n_nodes = size(prob_from_to,1)
    for node_index in 1:n_nodes
        lbl = node_text(node_index, min_start_prob)
        println(graph_text_buff,"n$node_index [label= \"$lbl\"];")
    end

    for from_node in 1:n_nodes
        for to_node in 1:n_nodes
            trans_prob = prob_from_to[from_node,to_node]
            if trans_prob>min_trans_prob
                print(graph_text_buff,"n$from_node->n$to_node [")
                print(graph_text_buff,"label= \"$trans_prob\", ")
                print(graph_text_buff,"penwidth = \"$(10*trans_prob)\", ")
                #print(graph_text_buff,"weight = \"$(int(100*trans_prob))\", ")
                print(graph_text_buff, "color = \"$(rand(1)[1]),1.0,0.5\"")
                println(graph_text_buff,"];")
            end
        end
    end
    println(graph_text_buff,"}")
    graph_text_buff |> takebuf_string 
end
get_dotfile(hmm,we) |> print

digraph graphname {
n1 [label= "field/tests/lines
Start Prob: 0.27"];
n2 [label= "probably/actually/certainly
Start Prob: 0.17"];
n3 [label= "./../?"];
n4 [label= "although/Although/where
Start Prob: 0.1"];
n5 [label= ",/--/;
Start Prob: 0.01"];
n6 [label= "for/from/including
Start Prob: 0.02"];
n7 [label= "was/*UNKNOWN*/are
Start Prob: 0.14"];
n8 [label= "There/there/...there
Start Prob: 0.02"];
n9 [label= "a/A/another
Start Prob: 0.03"];
n10 [label= "in/In/into
Start Prob: 0.02"];
n11 [label= "own/only/and
Start Prob: 0.07"];
n12 [label= "the/The/entire
Start Prob: 0.15"];
n1->n1 [label= "0.23", penwidth = "2.3000000000000003", weight = "23", color = "0.24117317248990688,1.0,0.5"];
n1->n2 [label= "0.11", penwidth = "1.1", weight = "11", color = "0.42962575952414706,1.0,0.5"];
n1->n3 [label= "0.18", penwidth = "1.7999999999999998", weight = "18", color = "0.257379325352332,1.0,0.5"];
n1->n4 [label= "0.02", penwidth = "0.2", weight = "2", color = "0.13985710466575507,1.0,0.5"];
n1->n5 

In [None]:
@pyimport pickle
infile = open("hmm2.pickle","r")
hmm = pickle.load(PyTextIO(infile))


In [None]:
wes = numpy.load("sample.npy")'
show_bests(we,wes,1)[1:2:end]

In [None]:
hmm = hl_hmm.GMMHMM(n_components=3, n_mix=3, covariance_type="full", n_iter=2)

In [None]:
hmm[:fit](xs)