In [26]:
using Pipe
using Iterators
using DataStructures

In [58]:
# Load the numeric table and the phrase list; rows of `ps` are later paired
# with entries of `phrases` by position.
ps = readcsv("results/language_simple_wikipedia.csv")
phrases = split(readall("results/language_simple_wikipedia.phrases.txt"), '\n');
# A file that ends with a newline produces an empty final piece; discard it.
isempty(phrases[end]) && pop!(phrases);

In [15]:
using PyCall
@pyimport sklearn
@pyimport sklearn.mixture as sklm

In [100]:
# Fit a 10-component, full-covariance mixture model to `ps` (n_init=105),
# then display the fitted component weights laid out as a single row.
gmm = sklm.GMM(n_components=10, covariance_type="full", n_init=105)
gmm[:fit](ps)
transpose(gmm[:weights_])

1x10 Array{Float64,2}:
 0.0454545  0.136364  0.136364  0.0454545  …  0.0681818  0.113636  0.113636

In [101]:
"""
    show_posterior_prob(gmm, xs, words)

Print one line per row of `xs`: the predicted component in square brackets, the
matching entry of `words` padded to 8 characters, a tab and `|`, then every
component's posterior probability rounded to two decimals.

# Arguments
- `gmm`: fitted model; must answer `gmm[:predict_proba](xs)` with a
  rows-by-components matrix and `gmm[:predict](xs)` with one label per row.
- `xs`: observation matrix, one row per entry of `words`.
- `words`: labels to print; `words[i]` belongs to row `i` of `xs`.

Rows are printed ordered by the column index of their largest posterior; rows
that share the same best column keep their input order. Returns `nothing`.
"""
function show_posterior_prob(gmm, xs, words)
    p_ij = gmm[:predict_proba](xs)
    # Fetch every label in one call instead of one model call per printed row.
    labels = gmm[:predict](xs)

    # Column holding the largest posterior of each row (first one wins on ties).
    best = [findmax(vec(p_ij[ii, :]))[2] for ii in 1:size(p_ij, 1)]

    # sortperm is stable, so rows with the same best column stay in input order.
    for ii in sortperm(best)
        print("[", labels[ii], "]")
        print(rpad(words[ii], 8), "\t|")
        for p in p_ij[ii, :]
            # Two-decimal rounding, written arithmetically so it does not rely
            # on a particular signature of `round`.
            print(" ", round(p * 100) / 100, " ")
        end
        println()
    end
    return nothing
end

show_posterior_prob (generic function with 1 method)

In [102]:
"""
    show_clusters(gmm, xs, words)

Print `words` grouped by the component that `gmm` assigns to the matching row
of `xs`.

# Arguments
- `gmm`: fitted model; must answer `gmm[:score](xs)` with one score per row and
  `gmm[:predict](xs)` with one integer component label per row.
- `xs`: observation matrix, one row per entry of `words`.
- `words`: labels to print; `words[i]` belongs to row `i` of `xs`.

Each cluster is printed as a `** label **` header followed by one ` - word`
line per member and a blank line. Clusters appear in ascending label order and,
inside a cluster, words run from the highest score to the lowest.
Returns `nothing`.
"""
function show_clusters(gmm, xs, words)
    scores = gmm[:score](xs)
    # Fetch every label in one call instead of one model call per row.
    labels = gmm[:predict](xs)

    clusters = Dict{Int,Vector{String}}()
    # Visit rows best-score-first so every cluster's list ends up in that order.
    for ii in sortperm(scores, rev=true)
        push!(get!(() -> String[], clusters, labels[ii]), words[ii])
    end

    # Dict iteration order is unspecified; sort labels for reproducible output.
    for cluster in sort(collect(keys(clusters)))
        println("** $cluster **")
        for word in clusters[cluster]
            println(" - ", word)
        end
        println()
    end
    return nothing
end

show_clusters (generic function with 1 method)

In [103]:
# Print the phrases grouped by the mixture component assigned to each row of `ps`.
show_clusters(gmm, ps, phrases)

** 0 **
 - *UNKNOWN* as an official second language : up to 300 million .
 - *UNKNOWN* capacity to learn and use language is inherited . normally , all humans are born with this capability .

** 7 **
 - a type of school subject . *UNKNOWN* says that *UNKNOWN* languages are at risk of becoming *UNKNOWN* . *UNKNOWN* of language
 - some people consider musical notation to be a way of writing the musical language . *UNKNOWN* is the language with the most native speakers in the world , but *UNKNOWN* is not really a language .
 - *UNKNOWN* *UNKNOWN* *UNKNOWN* *UNKNOWN* : 390 million native speakers .

** 9 **
 - it follows that language is not just any way of communicating .
 - it is the main second language of the world and the language of science , business , and entertainment . *UNKNOWN* as a first language : 380 million .
 - we call the bigger packets sentences or questions or replies or comments .
 - this is *UNKNOWN* in writing , where the marks are put on the paper or screen in the sa