In [None]:
using Pipe
push!(LOAD_PATH, "../word-embeddings2")
using WordEmbeddings
# Load the embedding file (its name says EMBEDDING_SIZE=50) and splat whatever the loader
# returns straight into the WE constructor; the trailing `;` suppresses the cell echo.
we = WE(load_embeddings("../word-embeddings2/word_emb_data/embeddings-scaled.EMBEDDING_SIZE=50.txt")...);


In [277]:
# Whitespace-separated word lists used as small test vocabularies (one word per line).
# `countries` is the list that gets split into `words` right after these definitions.
# Note it is not strictly countries: it also holds continents (Asia, Europe),
# abbreviations (USA, US, UK) and the lone token "United".
countries = """
France
Germany
England
Britain
China
Australia
Japan
Peru
Brazil
USA
US
United
America
Canada
UK
Asia
Europe
"""

# Singular and plural person nouns. Not referenced by the code visible in this cell;
# presumably an alternative list to swap in for `countries`.
people = """
men
boys
girls
women
man
boy
girl
woman
"""

# Body-part nouns. Not referenced by the code visible in this cell.
body_parts = """
hand
foot
arm
leg
eye
ear
tooth
chest
knee
"""

# "-ing" activity words. Not referenced by the code visible in this cell.
verbs = """
running
jumping
hiking
fishing
sailing
diving
cycling
swimming
golfing
fighting
reading
working
rowing
""" 
# Tokenise the chosen list on whitespace and store it as a Vector{String}.
words = convert(Vector{String}, split(countries))
# Look up the embeddings for `words` and transpose the result. Downstream code indexes
# `xs` by row with the same index as `words`, so presumably eval_word_embeddings returns
# one column per word -- TODO confirm against WordEmbeddings.
xs = eval_word_embeddings(we, words)';


In [269]:
"""
    show_posterior_prob(gmm, xs, words)

Print one line per word: the hard label returned by `gmm[:predict]` in square brackets,
the word right-padded to 8 characters, a tab and `|`, then the posterior probability of
every component (from `gmm[:predict_proba]`) rounded to two decimals.

Lines are grouped by each word's most probable component; inside a group the words keep
the order they have in `words`.

# Arguments
- `gmm`: fitted mixture model; `gmm[:predict_proba]` and `gmm[:predict]` must each give
  a callable that accepts `xs`.
- `xs`: feature matrix whose row `ii` describes `words[ii]`.
- `words`: the labels to print, indexable by row number.

Returns `nothing`; the table is written to standard output.
"""
function show_posterior_prob(gmm, xs, words)
    p_ij = gmm[:predict_proba](xs)
    # Ask for all hard labels in one call rather than one foreign call per printed row.
    labels = gmm[:predict](xs)

    # Index of the most probable component in every row (first one wins on ties).
    order = Int[findmax(vec(p_ij[ii, :]))[2] for ii in 1:size(p_ij, 1)]
    # Explicitly stable sort, so words sharing a component stay in input order however
    # long the list is.
    word_indexes = sortperm(order, alg=MergeSort)

    for ii in word_indexes
        word = rpad(words[ii], 8)
        print("[", labels[ii], "]")
        print("$word\t|")
        for p in p_ij[ii, :]
            # Two-decimal rounding, written arithmetically so it does not depend on
            # which signature of `round` accepts a digit count.
            print(" $(round(p * 100) / 100) ")
        end
        println()
    end
    return nothing
end

show_posterior_prob (generic function with 1 method)

In [270]:
using PyCall
@pyimport sklearn
@pyimport sklearn.mixture as sklm

In [282]:
# Two-component Gaussian mixture with a full covariance matrix per component.
# n_init = 105 requests 105 initialisations; scikit-learn keeps the best-scoring fit.
gmm = sklm.GMM(n_components = 2, covariance_type = "full", n_init = 105)
# Fit on the embedding matrix built above.
gmm[:fit](xs)
# Show the fitted mixing weights as a single row.
gmm[:weights_]'

1x2 Array{Float64,2}:
 0.764706  0.235294

In [283]:
# Print, for every word, its assigned component and the per-component posteriors.
show_posterior_prob(gmm, xs, words)

[0]France  	| 1.0  0.0 
[0]Germany 	| 1.0  0.0 
[0]England 	| 1.0  0.0 
[0]Britain 	| 1.0  0.0 
[0]China   	| 1.0  0.0 
[0]Australia	| 1.0  0.0 
[0]Japan   	| 1.0  0.0 
[0]Peru    	| 1.0  0.0 
[0]Brazil  	| 1.0  0.0 
[0]America 	| 1.0  0.0 
[0]Canada  	| 1.0  0.0 
[0]Asia    	| 1.0  0.0 
[0]Europe  	| 1.0  0.0 
[1]USA     	| 0.0  1.0 
[1]US      	| 0.0  1.0 
[1]United  	| 0.0  1.0 
[1]UK      	| 0.0  1.0 


In [284]:
# Draw a sample from the fitted mixture (called with no arguments, so the library's
# default sample count applies).
x = gmm[:sample]()
# score_samples returns two values, unpacked here as the log-likelihood and the
# per-component posterior probabilities of the sample.
lL, p = gmm[:score_samples](x)
println("log_likelyhood:$lL")
println("prediction_prob:$(round(p, 2))")
println("label: $(gmm[:predict](x))")
# Flatten the sample to a plain vector and hand it to show_best; judging by the recorded
# output this lists words alongside a score -- confirm against WordEmbeddings.
show_best(we, x[:])

log_likelyhood:[85.54531977041921]
prediction_prob:[0.0 1.0]
label: [1]


20x2 Array{Any,2}:
 "US"       0.95
 "4.2"      0.79
 "1.5"      0.77
 "1.2"      0.77
 "2.2"      0.76
 "Chinese"  0.76
 "2.6"      0.75
 "85"       0.75
 "170"      0.75
 "4.5"      0.74
 "110"      0.74
 "89"       0.74
 "HK"       0.73
 "3.9"      0.73
 "270"      0.73
 "34"       0.73
 "2.4"      0.73
 "35"       0.73
 "3.2"      0.73
 "14"       0.73

In [286]:
# Pass the transposed component means to show_bests, then keep every other column
# starting from the first. NOTE(review): presumably show_bests interleaves word and score
# columns and this slice drops the scores -- confirm against WordEmbeddings.
show_bests(we, gmm[:means_]')[:,1:2:end]

20x2 Array{Any,2}:
 "France"     "UK"           
 "Canada"     "United"       
 "Japan"      "Lippo"        
 "Europe"     "Gulf"         
 "Brazil"     "Canadian"     
 "India"      "European"     
 "Russia"     "Dutch"        
 "Germany"    "Central"      
 "Mexico"     "U.K."         
 "Britain"    "Thai"         
 "China"      "Pacific"      
 "Australia"  "Hungarian"    
 "Thailand"   "Delta"        
 "Malaysia"   "International"
 "America"    "Western"      
 "Turkey"     "Swiss"        
 "Asia"       "Commonwealth" 
 "Finland"    "Chinese"      
 "Pakistan"   "Malaysian"    
 "Sweden"     "Nordic"       